switch to official Prometheus client

This commit is contained in:
Jeffrey C. Ollie 2023-09-01 13:10:25 -05:00
parent 514fa50a8c
commit b0628543f8
No known key found for this signature in database
GPG key ID: F936E4DCB7E25F15
3 changed files with 282 additions and 217 deletions

17
poetry.lock generated
View file

@ -565,6 +565,21 @@ files = [
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
[[package]]
name = "prometheus-client"
version = "0.17.1"
description = "Python client for the Prometheus monitoring system."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "prometheus_client-0.17.1-py3-none-any.whl", hash = "sha256:e537f37160f6807b8202a6fc4764cdd19bac5480ddd3e0d463c3002b34462101"},
{file = "prometheus_client-0.17.1.tar.gz", hash = "sha256:21e674f39831ae3f8acde238afd9a27a37d0d2fb5a28ea094f0ce25d2cbf2091"},
]
[package.extras]
twisted = ["twisted"]
[[package]]
name = "protobuf"
version = "4.24.2"
@ -770,4 +785,4 @@ test = ["pytest", "pytest-grpc"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "d0558513b225264653cde8153abe3425004ee3298c1407cbb2dc672fe356b2f3"
content-hash = "9756b224be33434f1cf2d6702163ed6e6ffb37f40d608ea6be5e0a832d5e8bfe"

View file

@ -17,6 +17,7 @@ influxdb = "^5.3.1"
influxdb-client = "^1.37.0"
pypng = "^0.20220715.0"
typing-extensions = "^4.7.1"
prometheus-client = "^0.17.1"
[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
@ -32,7 +33,7 @@ build-backend = "poetry.core.masonry.api"
[tool.isort]
profile = "black"
line_length = 88
line_length = 120
force_single_line = true
force_sort_within_sections = true
from_first = false

View file

@ -1,17 +1,23 @@
#!/usr/bin/python3
"""Prometheus exporter for Starlink user terminal data info.
This script pulls the current status info and/or metrics computed from the
history data and makes it available via HTTP in the format Prometheus expects.
"""
from http import HTTPStatus
from http.server import BaseHTTPRequestHandler
from http.server import ThreadingHTTPServer
import logging
import signal
import sys
import threading
import time
from typing import Self
from prometheus_client import Counter
from prometheus_client import Enum
from prometheus_client import Gauge
from prometheus_client import Info
from prometheus_client import MetricsHandler
import starlink_grpc_tools.dish_common as dish_common
@ -25,57 +31,174 @@ def handle_sigterm(signum, frame):
raise Terminated
class MetricInfo:
unit = ""
kind = "gauge"
help = ""
common_labels = ["id"]
def __init__(self, unit=None, kind=None, help=None) -> None:
if unit:
self.unit = f"_{unit}"
if kind:
self.kind = kind
if help:
self.help = help
pass
METRICS_INFO = {
"status_uptime": MetricInfo(unit="seconds", kind="counter"),
"status_seconds_to_first_nonempty_slot": MetricInfo(),
"status_pop_ping_drop_rate": MetricInfo(),
"status_downlink_throughput_bps": MetricInfo(),
"status_uplink_throughput_bps": MetricInfo(),
"status_pop_ping_latency_ms": MetricInfo(),
"status_alerts": MetricInfo(),
"status_fraction_obstructed": MetricInfo(),
"status_currently_obstructed": MetricInfo(),
"status_seconds_obstructed": MetricInfo(),
"status_obstruction_duration": MetricInfo(),
"status_obstruction_interval": MetricInfo(),
"status_direction_azimuth": MetricInfo(),
"status_direction_elevation": MetricInfo(),
"status_is_snr_above_noise_floor": MetricInfo(),
"status_alert_motors_stuck": MetricInfo(),
"status_alert_thermal_throttle": MetricInfo(),
"status_alert_thermal_shutdown": MetricInfo(),
"status_alert_mast_not_near_vertical": MetricInfo(),
"status_alert_unexpected_location": MetricInfo(),
"status_alert_slow_ethernet_speeds": MetricInfo(),
"status_alert_roaming": MetricInfo(),
"status_alert_install_pending": MetricInfo(),
"status_alert_is_heating": MetricInfo(),
"status_alert_power_supply_thermal_throttle": MetricInfo(),
"status_alert_is_power_save_idle": MetricInfo(),
"status_alert_moving_fast_while_not_aviation": MetricInfo(),
"status_alert_moving_while_not_mobile": MetricInfo(),
"ping_stats_samples": MetricInfo(kind="counter"),
"ping_stats_end_counter": MetricInfo(kind="counter"),
"usage_download_usage": MetricInfo(unit="bytes", kind="counter"),
"usage_upload_usage": MetricInfo(unit="bytes", kind="counter"),
}
STATE_VALUES = [
METRICS: dict[str, Counter | Enum | Gauge] = {
"status_uptime": Gauge(
"starlink_status_uptime_seconds",
"",
common_labels,
),
"status_seconds_to_first_nonempty_slot": Gauge(
"starlink_status_seconds_to_first_nonempty_slot",
"",
common_labels,
),
"status_pop_ping_drop_rate": Gauge(
"starlink_status_pop_ping_drop_rate",
"",
common_labels,
),
"status_downlink_throughput_bps": Gauge(
"starlink_status_downlink_throughput_bps",
"",
common_labels,
),
"status_uplink_throughput_bps": Gauge(
"starlink_status_uplink_throughput_bps",
"",
common_labels,
),
"status_pop_ping_latency_ms": Gauge(
"starlink_status_pop_ping_latency_ms",
"",
common_labels,
),
"status_alerts": Gauge(
"starlink_status_alerts",
"",
common_labels,
),
"status_fraction_obstructed": Gauge(
"starlink_status_fraction_obstructed",
"",
common_labels,
),
"status_currently_obstructed": Gauge(
"starlink_status_currently_obstructed",
"",
common_labels,
),
"status_seconds_obstructed": Gauge(
"starlink_status_seconds_obstructed",
"",
common_labels,
),
"status_obstruction_duration": Gauge(
"starlink_status_obstruction_duration",
"",
common_labels,
),
"status_obstruction_interval": Gauge(
"starlink_status_obstruction_interval",
"",
common_labels,
),
"status_direction_azimuth": Gauge(
"starlink_status_direction_azimuth",
"",
common_labels,
),
"status_direction_elevation": Gauge(
"starlink_status_direction_elevation",
"",
common_labels,
),
"status_is_snr_above_noise_floor": Gauge(
"starlink_status_is_snr_above_noise_floor",
"",
common_labels,
),
"status_alert_motors_stuck": Gauge(
"starlink_status_alert_motors_stuck",
"",
common_labels,
),
"status_alert_thermal_throttle": Gauge(
"starlink_status_alert_thermal_throttle",
"",
common_labels,
),
"status_alert_thermal_shutdown": Gauge(
"starlink_status_alert_thermal_shutdown",
"",
common_labels,
),
"status_alert_mast_not_near_vertical": Gauge(
"starlink_status_alert_mast_not_near_vertical",
"",
common_labels,
),
"status_alert_unexpected_location": Gauge(
"starlink_status_alert_unexpected_location",
"",
common_labels,
),
"status_alert_slow_ethernet_speeds": Gauge(
"starlink_status_alert_slow_ethernet_speeds",
"",
common_labels,
),
"status_alert_roaming": Gauge(
"starlink_status_alert_roaming",
"",
common_labels,
),
"status_alert_install_pending": Gauge(
"starlink_status_alert_install_pending",
"",
common_labels,
),
"status_alert_is_heating": Gauge(
"starlink_status_alert_is_heating",
"",
common_labels,
),
"status_alert_power_supply_thermal_throttle": Gauge(
"starlink_status_alert_power_supply_thermal_throttle",
"",
common_labels,
),
"status_alert_is_power_save_idle": Gauge(
"starlink_status_alert_is_power_save_idle",
"",
common_labels,
),
"status_alert_moving_fast_while_not_aviation": Gauge(
"starlink_status_alert_moving_fast_while_not_aviation",
"",
common_labels,
),
"status_alert_moving_while_not_mobile": Gauge(
"starlink_status_alert_moving_while_not_mobile",
"",
common_labels,
),
"ping_stats_samples": Gauge(
"starlink_ping_stats_samples",
"",
common_labels,
),
"ping_stats_end_counter": Gauge(
"starlink_ping_stats_end_counter",
"",
common_labels,
),
"usage_download_usage": Gauge(
"starlink_usage_download_usage_bytes",
"",
common_labels,
),
"usage_upload_usage": Gauge(
"starlink_usage_upload_usage_bytes",
"",
common_labels,
),
"status_state": Enum(
"starlink_status_state",
"",
common_labels,
states=[
"UNKNOWN",
"CONNECTED",
"BOOTING",
@ -87,57 +210,25 @@ STATE_VALUES = [
"NO_DOWNLINK",
"NO_PINGS",
"DISH_UNREACHABLE",
]
],
),
}
class Metric:
name = ""
timestamp = ""
kind = None
help = None
values = None
def __init__(self, name, timestamp, kind="gauge", help="", values=None):
self.name = name
self.timestamp = timestamp
self.kind = kind
self.help = help
if values:
self.values = values
else:
self.values = []
pass
def __str__(self):
if not self.values:
return ""
lines = []
lines.append(f"# HELP {self.name} {self.help}")
lines.append(f"# TYPE {self.name} {self.kind}")
for value in self.values:
lines.append(f"{self.name}{value} {self.timestamp*1000}")
lines.append("")
return str.join("\n", lines)
class MetricValue:
value = 0
labels = None
def __init__(self, value, labels=None) -> None:
self.value = value
self.labels = labels
def __str__(self):
label_str = ""
if self.labels:
label_str = (
"{"
+ str.join(",", [f'{v[0]}="{v[1]}"' for v in self.labels.items()])
+ "}"
)
return f"{label_str} {self.value}"
info = Info(
"starlink_info",
"",
# common_labels,
)
unprocessed_metrics = Gauge(
"starlink_unprocessed_metrics",
"",
common_labels + ["metric"],
)
missing_metrics = Gauge(
"starlink_missing_metrics",
"",
common_labels + ["metric"],
)
def parse_args():
@ -152,119 +243,76 @@ def parse_args():
return dish_common.run_arg_parser(parser, modes=["status", "alert_detail", "usage"])
def prometheus_export(opts, gstate):
class GatherMetrics(threading.Thread):
def __init__(self: Self, opts, gstate, *args, **kw):
self.opts = opts
self.gstate = gstate
super().__init__(*args, **kw)
def run(self: Self):
while True:
self.gather()
time.sleep(5.0)
def gather(self: Self) -> None:
raw_data = {}
def data_add_item(name, value, category):
raw_data[category + "_" + name] = value
pass
def data_add_sequencem(name, value, category, start):
raise NotImplementedError("Did not expect sequence data")
with gstate.lock:
rc, status_ts, hist_ts = dish_common.get_data(
opts, gstate, data_add_item, data_add_sequencem
raise NotImplementedError(
f"Did not expect sequence data {name!r} {value!r} {category!r} {start!r}"
)
metrics = []
with self.gstate.lock:
rc, status_ts, hist_ts = dish_common.get_data(
self.opts, self.gstate, data_add_item, data_add_sequencem
)
# snr is not supported by starlink any more but still returned by the grpc
# service for backwards compatibility
if "status_snr" in raw_data:
del raw_data["status_snr"]
metrics.append(
Metric(
name="starlink_status_state",
timestamp=status_ts,
values=[
MetricValue(
value=int(raw_data["status_state"] == state_value),
labels={"state": state_value},
)
for state_value in STATE_VALUES
],
)
)
del raw_data["status_state"]
status_id = raw_data.get("status_id")
info_metrics = ["status_id", "status_hardware_version", "status_software_version"]
info_metrics = [
"status_id",
"status_hardware_version",
"status_software_version",
]
metrics_not_found = []
metrics_not_found.extend([x for x in info_metrics if x not in raw_data])
if len(metrics_not_found) < len(info_metrics):
metrics.append(
Metric(
name="starlink_info",
timestamp=status_ts,
values=[
MetricValue(
value=1,
labels={
info.info(
{
x.replace("status_", ""): raw_data.pop(x)
for x in info_metrics
if x in raw_data
},
)
],
)
}
)
for name, metric_info in METRICS_INFO.items():
for name, metric_info in METRICS.items():
if name in raw_data:
metrics.append(
Metric(
name=f"starlink_{name}{metric_info.unit}",
timestamp=status_ts,
kind=metric_info.kind,
values=[MetricValue(value=float(raw_data.pop(name) or 0))],
)
)
match metric_info:
case Gauge():
metric_info.labels(id=status_id).set(raw_data.pop(name) or 0)
case Enum():
metric_info.labels(id=status_id).state(raw_data.pop(name) or 0)
case _:
pass
else:
metrics_not_found.append(name)
metrics.append(
Metric(
name="starlink_exporter_unprocessed_metrics",
timestamp=status_ts,
values=[MetricValue(value=1, labels={"metric": name}) for name in raw_data],
)
)
for name in raw_data:
unprocessed_metrics.labels(id=status_id, metric=name).set(1)
metrics.append(
Metric(
name="starlink_exporter_missing_metrics",
timestamp=status_ts,
values=[
MetricValue(
value=1,
labels={"metric": name},
)
for name in metrics_not_found
],
)
)
return str.join("\n", [str(metric) for metric in metrics])
class MetricsRequestHandler(BaseHTTPRequestHandler):
def do_GET(self):
path = self.path.partition("?")[0]
if path.lower() == "/favicon.ico":
self.send_error(HTTPStatus.NOT_FOUND)
return
opts = self.server.opts
gstate = self.server.gstate
content = prometheus_export(opts, gstate)
self.send_response(HTTPStatus.OK)
self.send_header("Content-type", "text/plain")
self.send_header("Content-Length", len(content))
self.end_headers()
self.wfile.write(content.encode())
for name in metrics_not_found:
missing_metrics.labels(id=status_id, metric=name).set(1)
def main():
@ -275,10 +323,11 @@ def main():
gstate = dish_common.GlobalState(target=opts.target)
gstate.lock = threading.Lock()
httpd = ThreadingHTTPServer((opts.address, opts.port), MetricsRequestHandler)
gather = GatherMetrics(opts, gstate)
gather.start()
httpd = ThreadingHTTPServer((opts.address, opts.port), MetricsHandler)
httpd.daemon_threads = False
httpd.opts = opts
httpd.gstate = gstate
signal.signal(signal.SIGTERM, handle_sigterm)
@ -289,7 +338,7 @@ def main():
pass
finally:
httpd.server_close()
httpd.gstate.shutdown()
gstate.shutdown()
sys.exit()