chore: add upcloud monitoring (#1835)

* chore: add upcloud monitoring

* Added db_role to new_relic metrics

---------

Co-authored-by: Adrià Casajús <adria.casajus@proton.ch>
This commit is contained in:
Carlos Quintana 2023-08-04 12:19:00 +02:00 committed by GitHub
parent 9ab3695d36
commit 0e82801512
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 537 additions and 178 deletions

View File

@ -535,3 +535,7 @@ DISABLE_RATE_LIMIT = "DISABLE_RATE_LIMIT" in os.environ
# Each setting below is read from the process environment; the string
# settings are None when the corresponding variable is not set.
SUBSCRIPTION_CHANGE_WEBHOOK = os.getenv("SUBSCRIPTION_CHANGE_WEBHOOK")
# Parsed as an integer; falls back to 30 when the variable is not set.
MAX_API_KEYS = int(os.getenv("MAX_API_KEYS", 30))
# UpCloud API credentials and the id of the database whose metrics are
# fetched by the monitor.
UPCLOUD_USERNAME = os.getenv("UPCLOUD_USERNAME")
UPCLOUD_PASSWORD = os.getenv("UPCLOUD_PASSWORD")
UPCLOUD_DB_ID = os.getenv("UPCLOUD_DB_ID")

0
monitor/__init__.py Normal file
View File

21
monitor/metric.py Normal file
View File

@ -0,0 +1,21 @@
from dataclasses import dataclass
from typing import List
@dataclass
class UpcloudRecord:
    """A single sample of one metric for one database node."""

    # Role of the node: the metric parser sets "master" or "standby".
    db_role: str
    # Column label reported by the API, e.g. "test-1 (master)".
    label: str
    # Timestamp string taken from the metrics row.
    time: str
    # Numeric value of the sample.
    value: float
@dataclass
class UpcloudMetric:
    """All records collected for one named metric."""

    # Metric key as it appears in the API response, e.g. "cpu_usage".
    metric_name: str
    # One record per database node that reported a usable value.
    records: List[UpcloudRecord]
@dataclass
class UpcloudMetrics:
    """Container for every metric extracted from one API response."""

    # May be empty, e.g. when the metrics request did not succeed.
    metrics: List[UpcloudMetric]

View File

@ -0,0 +1,20 @@
from app.config import UPCLOUD_DB_ID, UPCLOUD_PASSWORD, UPCLOUD_USERNAME
from app.log import LOG
from monitor.newrelic import NewRelicClient
from monitor.upcloud import UpcloudClient
class MetricExporter:
    """Fetch the UpCloud database metrics and forward them to New Relic."""

    def __init__(self, newrelic_license: str):
        """Create the UpCloud and New Relic clients.

        Args:
            newrelic_license: license key handed to the New Relic client.

        Raises:
            Exception: propagated from UpcloudClient when the configured
                UpCloud username or password is empty.
        """
        self.__upcloud = UpcloudClient(
            username=UPCLOUD_USERNAME, password=UPCLOUD_PASSWORD
        )
        self.__newrelic = NewRelicClient(newrelic_license)

    def run(self):
        """Export the current metrics once.

        Any failure is logged as a warning and swallowed so that the caller
        is never interrupted by an export problem.
        """
        try:
            metrics = self.__upcloud.get_metrics(UPCLOUD_DB_ID)
            # An empty result means nothing could be fetched: do not push an
            # empty batch nor claim that metrics were sent.
            if not metrics.metrics:
                LOG.warning("No Upcloud metrics retrieved, nothing to send")
                return
            self.__newrelic.send(metrics)
            LOG.info("Upcloud metrics sent to NewRelic")
        except Exception as e:
            # Logger.warn is a deprecated alias of Logger.warning.
            LOG.warning(f"Could not export metrics: {e}")

26
monitor/newrelic.py Normal file
View File

@ -0,0 +1,26 @@
from monitor.metric import UpcloudMetrics
from newrelic_telemetry_sdk import GaugeMetric, MetricClient
_NEWRELIC_BASE_HOST = "metric-api.eu.newrelic.com"
class NewRelicClient:
    """Small wrapper that pushes UpCloud metrics to New Relic as gauges."""

    def __init__(self, license_key: str):
        """Build the underlying metric client for the configured host."""
        self.__client = MetricClient(license_key=license_key, host=_NEWRELIC_BASE_HOST)

    def send(self, metrics: UpcloudMetrics):
        """Send every record of every metric in a single batch.

        Each record becomes one gauge named ``upcloud.db.<metric_name>``,
        tagged with the node label (``host``) and its ``db_role``. An error
        status in the response is raised by ``raise_for_status()``.
        """
        gauges = [
            GaugeMetric(
                name=f"upcloud.db.{metric.metric_name}",
                value=record.value,
                tags={"host": record.label, "db_role": record.db_role},
            )
            for metric in metrics.metrics
            for record in metric.records
        ]
        self.__client.send_batch(gauges).raise_for_status()

82
monitor/upcloud.py Normal file
View File

@ -0,0 +1,82 @@
from app.log import LOG
from monitor.metric import UpcloudMetric, UpcloudMetrics, UpcloudRecord
import base64
import requests
from typing import Any
BASE_URL = "https://api.upcloud.com"
def get_metric(json: Any, metric: str) -> UpcloudMetric:
    """Extract the most recent value of ``metric`` for every database node.

    ``json[metric]["data"]`` is read as a table: ``cols`` is a list of column
    descriptors (the first one is the time column, the others carry a node
    ``label``) and ``rows`` is a list of ``[time, value, value, ...]``.

    For each node the value of the last row is used. When it is ``None`` the
    second-to-last row is tried instead; when that one is missing or also
    ``None`` the node is skipped.

    Args:
        json: decoded metrics response.
        metric: key of the metric to extract, e.g. ``"cpu_usage"``.

    Returns:
        An UpcloudMetric named ``metric``. Its records are empty when the
        metric is absent from ``json`` or has no rows.
    """
    records = []
    rows = json[metric]["data"]["rows"] if metric in json else None
    if not rows:
        # Missing metric or empty table: nothing to report.
        LOG.warning(f"Could not get value for metric {metric}")
        return UpcloudMetric(metric_name=metric, records=records)

    latest = rows[-1]
    # Only available when the response holds at least two rows.
    previous = rows[-2] if len(rows) > 1 else None
    # NOTE: records always carry the latest row's timestamp, even when the
    # value itself is taken from the previous row.
    time = latest[0]

    labels = [col["label"] for col in json[metric]["data"]["cols"][1:]]
    # Column 0 is the time column, so node values start at index 1.
    for idx, label in enumerate(labels, start=1):
        value = latest[idx]
        if value is None and previous is not None:
            value = previous[idx]
        if value is None:
            continue
        db_role = "master" if "(master)" in label else "standby"
        records.append(
            UpcloudRecord(time=time, db_role=db_role, label=label, value=value)
        )
    return UpcloudMetric(metric_name=metric, records=records)
def get_metrics(json: Any) -> UpcloudMetrics:
    """Extract the fixed set of database metrics from a decoded response.

    The resulting list always has one entry per metric name below, in this
    order; each entry is produced by ``get_metric``.
    """
    metric_names = (
        "cpu_usage",
        "disk_usage",
        "diskio_reads",
        "diskio_writes",
        "load_average",
        "mem_usage",
        "net_receive",
        "net_send",
    )
    extracted = [get_metric(json, name) for name in metric_names]
    return UpcloudMetrics(metrics=extracted)
class UpcloudClient:
    """HTTP client for the UpCloud API using basic authentication."""

    def __init__(self, username: str, password: str, timeout: float = 10.0):
        """Prepare a session carrying the basic-auth header.

        Args:
            username: UpCloud API username; must be non-empty.
            password: UpCloud API password; must be non-empty.
            timeout: seconds to wait for the API before giving up, so that a
                stalled connection cannot block the caller forever.

        Raises:
            ValueError: if ``username`` or ``password`` is empty.
        """
        if not username:
            raise ValueError("UpcloudClient username must be set")
        if not password:
            raise ValueError("UpcloudClient password must be set")

        client = requests.Session()
        encoded_auth = base64.b64encode(
            f"{username}:{password}".encode("utf-8")
        ).decode("utf-8")
        client.headers = {"Authorization": f"Basic {encoded_auth}"}
        self.__client = client
        self.__timeout = timeout

    def get_metrics(self, db_uuid: str) -> UpcloudMetrics:
        """Fetch the last hour of metrics for the given database.

        Args:
            db_uuid: identifier of the database in the UpCloud API.

        Returns:
            The parsed metrics, or an UpcloudMetrics with an empty list when
            the API does not answer with status 200.
        """
        url = f"{BASE_URL}/1.3/database/{db_uuid}/metrics?period=hour"
        LOG.d(f"Performing request to {url}")
        # Without a timeout, requests would wait indefinitely on a hung server.
        response = self.__client.get(url, timeout=self.__timeout)
        LOG.d(f"Status code: {response.status_code}")
        if response.status_code != 200:
            return UpcloudMetrics(metrics=[])

        as_json = response.json()
        return get_metrics(as_json)

View File

@ -1,3 +1,4 @@
import configparser
import os
import subprocess
from time import sleep
@ -7,6 +8,7 @@ import newrelic.agent
from app.db import Session
from app.log import LOG
from monitor.metric_exporter import MetricExporter
# the number of consecutive fails
# if more than _max_nb_fails, alert
@ -19,6 +21,18 @@ _max_nb_fails = 10
# the maximum number of emails in incoming & active queue
_max_incoming = 50
# Name of the environment variable holding the path of the New Relic
# configuration file.
_NR_CONFIG_FILE_LOCATION_VAR = "NEW_RELIC_CONFIG_FILE"


def get_newrelic_license() -> str:
    """Return the ``license_key`` of the ``[newrelic]`` section.

    The configuration file path is taken from the environment variable named
    by ``_NR_CONFIG_FILE_LOCATION_VAR``.

    Raises:
        Exception: if the variable is not defined, if the file cannot be
            read, or if it has no ``[newrelic]`` ``license_key`` entry.
    """
    nr_file = os.environ.get(_NR_CONFIG_FILE_LOCATION_VAR)
    if nr_file is None:
        raise Exception(f"{_NR_CONFIG_FILE_LOCATION_VAR} not defined")

    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed: an empty list means the file was not read.
    if not config.read(nr_file):
        raise Exception(f"Could not read New Relic config file {nr_file}")

    try:
        return config["newrelic"]["license_key"]
    except KeyError as err:
        raise Exception(
            f"license_key not found in [newrelic] section of {nr_file}"
        ) from err
@newrelic.agent.background_task()
def log_postfix_metrics():
@ -80,10 +94,13 @@ def log_nb_db_connection():
if __name__ == "__main__":
    # Built once before the loop. get_newrelic_license() raises when the
    # New Relic config file location is not defined, which stops the
    # monitor at startup.
    exporter = MetricExporter(get_newrelic_license())
    while True:
        log_postfix_metrics()
        log_nb_db_connection()
        # NOTE(review): presumably releases the DB session used by the call
        # above - confirm against app.db.
        Session.close()
        # run() logs export failures instead of raising, so a failed export
        # does not break this loop.
        exporter.run()
        # Wait one minute before the next round.
        sleep(60)

194
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -111,6 +111,7 @@ Deprecated = "^1.2.13"
cryptography = "37.0.1"
SQLAlchemy = "1.3.24"
redis = "^4.5.3"
newrelic-telemetry-sdk = "^0.5.0"
[tool.poetry.dev-dependencies]
pytest = "^7.0.0"

View File

View File

@ -0,0 +1,350 @@
from monitor.upcloud import get_metric, get_metrics
from monitor.metric import UpcloudMetrics, UpcloudMetric, UpcloudRecord
import json
MOCK_RESPONSE = """
{
"cpu_usage": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 2.744682398273781, 3.054323473090861],
["2022-01-21T13:11:00Z", 3.0735645433218366, 2.972423595745795],
["2022-01-21T13:11:30Z", 2.61619694060839, 3.1358378052207883],
["2022-01-21T13:12:00Z", 3.275132296130991, 4.196249043309251]
]
},
"hints": { "title": "CPU usage %" }
},
"disk_usage": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 5.654416415900109, 5.58959125727556],
["2022-01-21T13:11:00Z", 5.654416415900109, 5.58959125727556],
["2022-01-21T13:11:30Z", 5.654416415900109, 5.58959125727556]
]
},
"hints": { "title": "Disk space usage %" }
},
"diskio_reads": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 0, 0],
["2022-01-21T13:11:00Z", 0, 0],
["2022-01-21T13:11:30Z", 0, 0]
]
},
"hints": { "title": "Disk iops (reads)" }
},
"diskio_writes": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 3, 2],
["2022-01-21T13:11:00Z", 2, 3],
["2022-01-21T13:11:30Z", 4, 3]
]
},
"hints": { "title": "Disk iops (writes)" }
},
"load_average": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 0.11, 0.11],
["2022-01-21T13:11:00Z", 0.14, 0.1],
["2022-01-21T13:11:30Z", 0.14, 0.09]
]
},
"hints": { "title": "Load average (5 min)" }
},
"mem_usage": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 11.491766148261078, 12.318932883261219],
["2022-01-21T13:11:00Z", 11.511967645759277, 12.304403727425075],
["2022-01-21T13:11:30Z", 11.488581675749048, 12.272260458006759]
]
},
"hints": { "title": "Memory usage %" }
},
"net_receive": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 442, 470],
["2022-01-21T13:11:00Z", 439, 384],
["2022-01-21T13:11:30Z", 466, 458]
]
},
"hints": { "title": "Network receive (bytes/s)" }
},
"net_send": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 672, 581],
["2022-01-21T13:11:00Z", 660, 555],
["2022-01-21T13:11:30Z", 694, 573]
]
},
"hints": { "title": "Network transmit (bytes/s)" }
}
}
"""
def test_get_metrics():
    """get_metrics returns the latest row of every metric for both nodes."""
    # metric name -> (time of the last row, master value, standby value)
    expected_latest = {
        "cpu_usage": ("2022-01-21T13:12:00Z", 3.275132296130991, 4.196249043309251),
        "disk_usage": ("2022-01-21T13:11:30Z", 5.654416415900109, 5.58959125727556),
        "diskio_reads": ("2022-01-21T13:11:30Z", 0, 0),
        "diskio_writes": ("2022-01-21T13:11:30Z", 4, 3),
        "load_average": ("2022-01-21T13:11:30Z", 0.14, 0.09),
        "mem_usage": ("2022-01-21T13:11:30Z", 11.488581675749048, 12.272260458006759),
        "net_receive": ("2022-01-21T13:11:30Z", 466, 458),
        "net_send": ("2022-01-21T13:11:30Z", 694, 573),
    }
    expected = UpcloudMetrics(
        metrics=[
            UpcloudMetric(
                metric_name=name,
                records=[
                    UpcloudRecord(
                        db_role="master",
                        label="test-1 (master)",
                        time=time,
                        value=master_value,
                    ),
                    UpcloudRecord(
                        db_role="standby",
                        label="test-2 (standby)",
                        time=time,
                        value=standby_value,
                    ),
                ],
            )
            for name, (time, master_value, standby_value) in expected_latest.items()
        ]
    )

    parsed = get_metrics(json.loads(MOCK_RESPONSE))

    assert parsed == expected
def test_get_metric():
    """get_metric reports the last cpu_usage row for the two nodes."""
    metric_name = "cpu_usage"
    metric = get_metric(json.loads(MOCK_RESPONSE), metric_name)

    assert metric.metric_name == metric_name
    assert len(metric.records) == 2

    master, standby = metric.records
    assert (master.label, master.time, master.value) == (
        "test-1 (master)",
        "2022-01-21T13:12:00Z",
        3.275132296130991,
    )
    assert (standby.label, standby.time, standby.value) == (
        "test-2 (standby)",
        "2022-01-21T13:12:00Z",
        4.196249043309251,
    )
def test_get_metric_with_none_value():
    """A null in the last row falls back to the second-to-last row."""
    # The fixture text is kept verbatim.
    payload = """
{
"cpu_usage": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 2.744682398273781, 3.054323473090861],
["2022-01-21T13:11:00Z", 3.0735645433218366, 2.972423595745795],
["2022-01-21T13:11:30Z", null, 3.1358378052207883],
["2022-01-21T13:12:00Z", 3.275132296130991, null]
]
},
"hints": { "title": "CPU usage %" }
}
}
"""
    metric = get_metric(json.loads(payload), "cpu_usage")

    observed = [(record.label, record.value) for record in metric.records[:2]]
    assert observed == [
        ("test-1 (master)", 3.275132296130991),
        ("test-2 (standby)", 3.1358378052207883),
    ]
def test_get_metric_with_none_value_in_last_two_positions():
    """A node that is null in the last two rows produces no record."""
    # The fixture text is kept verbatim.
    payload = """
{
"cpu_usage": {
"data": {
"cols": [
{ "label": "time", "type": "date" },
{ "label": "test-1 (master)", "type": "number" },
{ "label": "test-2 (standby)", "type": "number" }
],
"rows": [
["2022-01-21T13:10:30Z", 2.744682398273781, 3.054323473090861],
["2022-01-21T13:11:00Z", 3.0735645433218366, 2.972423595745795],
["2022-01-21T13:11:30Z", null, null],
["2022-01-21T13:12:00Z", 3.275132296130991, null]
]
},
"hints": { "title": "CPU usage %" }
}
}
"""
    metric = get_metric(json.loads(payload), "cpu_usage")

    assert len(metric.records) == 1
    only_record = metric.records[0]
    assert only_record.label == "test-1 (master)"
    assert only_record.value == 3.275132296130991