From b0628543f844da6d66c7656a73ce3ebb3ae2f6e2 Mon Sep 17 00:00:00 2001 From: "Jeffrey C. Ollie" Date: Fri, 1 Sep 2023 13:10:25 -0500 Subject: [PATCH] switch to official Prometheus client --- poetry.lock | 17 +- pyproject.toml | 3 +- starlink_grpc_tools/dish_grpc_prometheus.py | 479 +++++++++++--------- 3 files changed, 282 insertions(+), 217 deletions(-) diff --git a/poetry.lock b/poetry.lock index 927753e..ffbc7be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -565,6 +565,21 @@ files = [ docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] +[[package]] +name = "prometheus-client" +version = "0.17.1" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "prometheus_client-0.17.1-py3-none-any.whl", hash = "sha256:e537f37160f6807b8202a6fc4764cdd19bac5480ddd3e0d463c3002b34462101"}, + {file = "prometheus_client-0.17.1.tar.gz", hash = "sha256:21e674f39831ae3f8acde238afd9a27a37d0d2fb5a28ea094f0ce25d2cbf2091"}, +] + +[package.extras] +twisted = ["twisted"] + [[package]] name = "protobuf" version = "4.24.2" @@ -770,4 +785,4 @@ test = ["pytest", "pytest-grpc"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "d0558513b225264653cde8153abe3425004ee3298c1407cbb2dc672fe356b2f3" +content-hash = "9756b224be33434f1cf2d6702163ed6e6ffb37f40d608ea6be5e0a832d5e8bfe" diff --git a/pyproject.toml b/pyproject.toml index 14346d1..a14e863 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ influxdb = "^5.3.1" influxdb-client = "^1.37.0" pypng = "^0.20220715.0" typing-extensions = "^4.7.1" +prometheus-client = "^0.17.1" [tool.poetry.group.dev.dependencies] black = "^23.7.0" @@ -32,7 +33,7 @@ build-backend = "poetry.core.masonry.api" [tool.isort] profile = "black" -line_length = 88 +line_length = 120 force_single_line = true force_sort_within_sections = true from_first = false diff --git a/starlink_grpc_tools/dish_grpc_prometheus.py b/starlink_grpc_tools/dish_grpc_prometheus.py index 7361bbc..afb3a5d 100644 --- a/starlink_grpc_tools/dish_grpc_prometheus.py +++ b/starlink_grpc_tools/dish_grpc_prometheus.py @@ -1,17 +1,23 @@ -#!/usr/bin/python3 """Prometheus exporter for Starlink user terminal data info. This script pulls the current status info and/or metrics computed from the history data and makes it available via HTTP in the format Prometheus expects. """ -from http import HTTPStatus -from http.server import BaseHTTPRequestHandler + from http.server import ThreadingHTTPServer import logging import signal import sys import threading +import time +from typing import Self + +from prometheus_client import Counter +from prometheus_client import Enum +from prometheus_client import Gauge +from prometheus_client import Info +from prometheus_client import MetricsHandler import starlink_grpc_tools.dish_common as dish_common @@ -25,119 +31,204 @@ def handle_sigterm(signum, frame): raise Terminated -class MetricInfo: - unit = "" - kind = "gauge" - help = "" +common_labels = ["id"] - def __init__(self, unit=None, kind=None, help=None) -> None: - if unit: - self.unit = f"_{unit}" - if kind: - self.kind = kind - if help: - self.help = help - pass - - -METRICS_INFO = { - "status_uptime": MetricInfo(unit="seconds", kind="counter"), - "status_seconds_to_first_nonempty_slot": MetricInfo(), - "status_pop_ping_drop_rate": MetricInfo(), - "status_downlink_throughput_bps": MetricInfo(), - "status_uplink_throughput_bps": MetricInfo(), - "status_pop_ping_latency_ms": MetricInfo(), - "status_alerts": MetricInfo(), - "status_fraction_obstructed": MetricInfo(), - "status_currently_obstructed": MetricInfo(), - "status_seconds_obstructed": MetricInfo(), - "status_obstruction_duration": MetricInfo(), - "status_obstruction_interval": MetricInfo(), - "status_direction_azimuth": MetricInfo(), - "status_direction_elevation": MetricInfo(), - "status_is_snr_above_noise_floor": MetricInfo(), - "status_alert_motors_stuck": MetricInfo(), - "status_alert_thermal_throttle": MetricInfo(), - "status_alert_thermal_shutdown": MetricInfo(), - "status_alert_mast_not_near_vertical": MetricInfo(), - "status_alert_unexpected_location": MetricInfo(), - "status_alert_slow_ethernet_speeds": MetricInfo(), - "status_alert_roaming": MetricInfo(), - "status_alert_install_pending": MetricInfo(), - "status_alert_is_heating": MetricInfo(), - "status_alert_power_supply_thermal_throttle": MetricInfo(), - "status_alert_is_power_save_idle": MetricInfo(), - "status_alert_moving_fast_while_not_aviation": MetricInfo(), - "status_alert_moving_while_not_mobile": MetricInfo(), - "ping_stats_samples": MetricInfo(kind="counter"), - "ping_stats_end_counter": MetricInfo(kind="counter"), - "usage_download_usage": MetricInfo(unit="bytes", kind="counter"), - "usage_upload_usage": MetricInfo(unit="bytes", kind="counter"), +METRICS: dict[str, Counter | Enum | Gauge] = { + "status_uptime": Gauge( + "starlink_status_uptime_seconds", + "", + common_labels, + ), + "status_seconds_to_first_nonempty_slot": Gauge( + "starlink_status_seconds_to_first_nonempty_slot", + "", + common_labels, + ), + "status_pop_ping_drop_rate": Gauge( + "starlink_status_pop_ping_drop_rate", + "", + common_labels, + ), + "status_downlink_throughput_bps": Gauge( + "starlink_status_downlink_throughput_bps", + "", + common_labels, + ), + "status_uplink_throughput_bps": Gauge( + "starlink_status_uplink_throughput_bps", + "", + common_labels, + ), + "status_pop_ping_latency_ms": Gauge( + "starlink_status_pop_ping_latency_ms", + "", + common_labels, + ), + "status_alerts": Gauge( + "starlink_status_alerts", + "", + common_labels, + ), + "status_fraction_obstructed": Gauge( + "starlink_status_fraction_obstructed", + "", + common_labels, + ), + "status_currently_obstructed": Gauge( + "starlink_status_currently_obstructed", + "", + common_labels, + ), + "status_seconds_obstructed": Gauge( + "starlink_status_seconds_obstructed", + "", + common_labels, + ), + "status_obstruction_duration": Gauge( + "starlink_status_obstruction_duration", + "", + common_labels, + ), + "status_obstruction_interval": Gauge( + "starlink_status_obstruction_interval", + "", + common_labels, + ), + "status_direction_azimuth": Gauge( + "starlink_status_direction_azimuth", + "", + common_labels, + ), + "status_direction_elevation": Gauge( + "starlink_status_direction_elevation", + "", + common_labels, + ), + "status_is_snr_above_noise_floor": Gauge( + "starlink_status_is_snr_above_noise_floor", + "", + common_labels, + ), + "status_alert_motors_stuck": Gauge( + "starlink_status_alert_motors_stuck", + "", + common_labels, + ), + "status_alert_thermal_throttle": Gauge( + "starlink_status_alert_thermal_throttle", + "", + common_labels, + ), + "status_alert_thermal_shutdown": Gauge( + "starlink_status_alert_thermal_shutdown", + "", + common_labels, + ), + "status_alert_mast_not_near_vertical": Gauge( + "starlink_status_alert_mast_not_near_vertical", + "", + common_labels, + ), + "status_alert_unexpected_location": Gauge( + "starlink_status_alert_unexpected_location", + "", + common_labels, + ), + "status_alert_slow_ethernet_speeds": Gauge( + "starlink_status_alert_slow_ethernet_speeds", + "", + common_labels, + ), + "status_alert_roaming": Gauge( + "starlink_status_alert_roaming", + "", + common_labels, + ), + "status_alert_install_pending": Gauge( + "starlink_status_alert_install_pending", + "", + common_labels, + ), + "status_alert_is_heating": Gauge( + "starlink_status_alert_is_heating", + "", + common_labels, + ), + "status_alert_power_supply_thermal_throttle": Gauge( + "starlink_status_alert_power_supply_thermal_throttle", + "", + common_labels, + ), + "status_alert_is_power_save_idle": Gauge( + "starlink_status_alert_is_power_save_idle", + "", + common_labels, + ), + "status_alert_moving_fast_while_not_aviation": Gauge( + "starlink_status_alert_moving_fast_while_not_aviation", + "", + common_labels, + ), + "status_alert_moving_while_not_mobile": Gauge( + "starlink_status_alert_moving_while_not_mobile", + "", + common_labels, + ), + "ping_stats_samples": Gauge( + "starlink_ping_stats_samples", + "", + common_labels, + ), + "ping_stats_end_counter": Gauge( + "starlink_ping_stats_end_counter", + "", + common_labels, + ), + "usage_download_usage": Gauge( + "starlink_usage_download_usage_bytes", + "", + common_labels, + ), + "usage_upload_usage": Gauge( + "starlink_usage_upload_usage_bytes", + "", + common_labels, + ), + "status_state": Enum( + "starlink_status_state", + "", + common_labels, + states=[ + "UNKNOWN", + "CONNECTED", + "BOOTING", + "SEARCHING", + "STOWED", + "THERMAL_SHUTDOWN", + "NO_SATS", + "OBSTRUCTED", + "NO_DOWNLINK", + "NO_PINGS", + "DISH_UNREACHABLE", + ], + ), } -STATE_VALUES = [ - "UNKNOWN", - "CONNECTED", - "BOOTING", - "SEARCHING", - "STOWED", - "THERMAL_SHUTDOWN", - "NO_SATS", - "OBSTRUCTED", - "NO_DOWNLINK", - "NO_PINGS", - "DISH_UNREACHABLE", -] - - -class Metric: - name = "" - timestamp = "" - kind = None - help = None - values = None - - def __init__(self, name, timestamp, kind="gauge", help="", values=None): - self.name = name - self.timestamp = timestamp - self.kind = kind - self.help = help - if values: - self.values = values - else: - self.values = [] - pass - - def __str__(self): - if not self.values: - return "" - - lines = [] - lines.append(f"# HELP {self.name} {self.help}") - lines.append(f"# TYPE {self.name} {self.kind}") - for value in self.values: - lines.append(f"{self.name}{value} {self.timestamp*1000}") - lines.append("") - return str.join("\n", lines) - - -class MetricValue: - value = 0 - labels = None - - def __init__(self, value, labels=None) -> None: - self.value = value - self.labels = labels - - def __str__(self): - label_str = "" - if self.labels: - label_str = ( - "{" - + str.join(",", [f'{v[0]}="{v[1]}"' for v in self.labels.items()]) - + "}" - ) - return f"{label_str} {self.value}" +info = Info( + "starlink_info", + "", + # common_labels, +) +unprocessed_metrics = Gauge( + "starlink_unprocessed_metrics", + "", + common_labels + ["metric"], +) +missing_metrics = Gauge( + "starlink_missing_metrics", + "", + common_labels + ["metric"], +) def parse_args(): @@ -152,119 +243,76 @@ def parse_args(): return dish_common.run_arg_parser(parser, modes=["status", "alert_detail", "usage"]) -def prometheus_export(opts, gstate): - raw_data = {} +class GatherMetrics(threading.Thread): + def __init__(self: Self, opts, gstate, *args, **kw): + self.opts = opts + self.gstate = gstate + super().__init__(*args, **kw) - def data_add_item(name, value, category): - raw_data[category + "_" + name] = value - pass + def run(self: Self): + while True: + self.gather() + time.sleep(5.0) - def data_add_sequencem(name, value, category, start): - raise NotImplementedError("Did not expect sequence data") + def gather(self: Self) -> None: + raw_data = {} - with gstate.lock: - rc, status_ts, hist_ts = dish_common.get_data( - opts, gstate, data_add_item, data_add_sequencem - ) + def data_add_item(name, value, category): + raw_data[category + "_" + name] = value - metrics = [] - - # snr is not supported by starlink any more but still returned by the grpc - # service for backwards compatibility - if "status_snr" in raw_data: - del raw_data["status_snr"] - - metrics.append( - Metric( - name="starlink_status_state", - timestamp=status_ts, - values=[ - MetricValue( - value=int(raw_data["status_state"] == state_value), - labels={"state": state_value}, - ) - for state_value in STATE_VALUES - ], - ) - ) - del raw_data["status_state"] - - info_metrics = ["status_id", "status_hardware_version", "status_software_version"] - metrics_not_found = [] - metrics_not_found.extend([x for x in info_metrics if x not in raw_data]) - - if len(metrics_not_found) < len(info_metrics): - metrics.append( - Metric( - name="starlink_info", - timestamp=status_ts, - values=[ - MetricValue( - value=1, - labels={ - x.replace("status_", ""): raw_data.pop(x) - for x in info_metrics - if x in raw_data - }, - ) - ], + def data_add_sequencem(name, value, category, start): + raise NotImplementedError( + f"Did not expect sequence data {name!r} {value!r} {category!r} {start!r}" ) - ) - for name, metric_info in METRICS_INFO.items(): - if name in raw_data: - metrics.append( - Metric( - name=f"starlink_{name}{metric_info.unit}", - timestamp=status_ts, - kind=metric_info.kind, - values=[MetricValue(value=float(raw_data.pop(name) or 0))], - ) + with self.gstate.lock: + rc, status_ts, hist_ts = dish_common.get_data( + self.opts, self.gstate, data_add_item, data_add_sequencem ) - else: - metrics_not_found.append(name) - metrics.append( - Metric( - name="starlink_exporter_unprocessed_metrics", - timestamp=status_ts, - values=[MetricValue(value=1, labels={"metric": name}) for name in raw_data], + # snr is not supported by starlink any more but still returned by the grpc + # service for backwards compatibility + if "status_snr" in raw_data: + del raw_data["status_snr"] + + status_id = raw_data.get("status_id") + + info_metrics = [ + "status_id", + "status_hardware_version", + "status_software_version", + ] + metrics_not_found = [] + metrics_not_found.extend([x for x in info_metrics if x not in raw_data]) + + info.info( + { + x.replace("status_", ""): raw_data.pop(x) + for x in info_metrics + if x in raw_data + } ) - ) - metrics.append( - Metric( - name="starlink_exporter_missing_metrics", - timestamp=status_ts, - values=[ - MetricValue( - value=1, - labels={"metric": name}, - ) - for name in metrics_not_found - ], - ) - ) + for name, metric_info in METRICS.items(): + if name in raw_data: + match metric_info: + case Gauge(): + metric_info.labels(id=status_id).set(raw_data.pop(name) or 0) - return str.join("\n", [str(metric) for metric in metrics]) + case Enum(): + metric_info.labels(id=status_id).state(raw_data.pop(name) or 0) + case _: + pass -class MetricsRequestHandler(BaseHTTPRequestHandler): - def do_GET(self): - path = self.path.partition("?")[0] - if path.lower() == "/favicon.ico": - self.send_error(HTTPStatus.NOT_FOUND) - return + else: + metrics_not_found.append(name) - opts = self.server.opts - gstate = self.server.gstate + for name in raw_data: + unprocessed_metrics.labels(id=status_id, metric=name).set(1) - content = prometheus_export(opts, gstate) - self.send_response(HTTPStatus.OK) - self.send_header("Content-type", "text/plain") - self.send_header("Content-Length", len(content)) - self.end_headers() - self.wfile.write(content.encode()) + for name in metrics_not_found: + missing_metrics.labels(id=status_id, metric=name).set(1) def main(): @@ -275,10 +323,11 @@ def main(): gstate = dish_common.GlobalState(target=opts.target) gstate.lock = threading.Lock() - httpd = ThreadingHTTPServer((opts.address, opts.port), MetricsRequestHandler) + gather = GatherMetrics(opts, gstate) + gather.start() + + httpd = ThreadingHTTPServer((opts.address, opts.port), MetricsHandler) httpd.daemon_threads = False - httpd.opts = opts - httpd.gstate = gstate signal.signal(signal.SIGTERM, handle_sigterm) @@ -289,7 +338,7 @@ def main(): pass finally: httpd.server_close() - httpd.gstate.shutdown() + gstate.shutdown() sys.exit()