starlink-grpc-tools/starlink_grpc_tools/dish_grpc_prometheus.py
2023-09-01 13:30:54 -05:00

352 lines
9.1 KiB
Python

"""Prometheus exporter for Starlink user terminal data info.
This script pulls the current status info and/or metrics computed from the
history data and makes it available via HTTP in the format Prometheus expects.
"""
from http.server import ThreadingHTTPServer
import logging
import signal
import sys
import threading
import time
from typing import Self
from prometheus_client import Counter
from prometheus_client import Enum
from prometheus_client import Gauge
from prometheus_client import Info
from prometheus_client import MetricsHandler
import starlink_grpc_tools.dish_common as dish_common
from starlink_grpc_tools.logging import setup_logging
logger = logging.getLogger(__name__)
class Terminated(Exception):
    """Raised from the SIGTERM handler to unwind the main loop for cleanup."""
def handle_sigterm(signum, frame):
    """Signal handler that converts SIGTERM into a ``Terminated`` exception.

    Both arguments are supplied by the ``signal`` machinery and are ignored.
    Raising here lets the code that was interrupted run its normal
    ``except``/``finally`` cleanup instead of being killed outright.
    """
    raise Terminated()
# Label names attached to every per-dish metric defined in this module.
common_labels = ["id"]

# Maps the key under which a value is collected (category + "_" + field name)
# to the Prometheus metric that publishes it.  Every gauge is created with
# empty help text and the shared label set; the pairs below are
# (collection key, exported metric name) in registration order.
METRICS: dict[str, Counter | Enum | Gauge] = {
    key: Gauge(exported_name, "", common_labels)
    for key, exported_name in (
        ("status_uptime", "starlink_status_uptime_seconds"),
        (
            "status_seconds_to_first_nonempty_slot",
            "starlink_status_seconds_to_first_nonempty_slot",
        ),
        ("status_pop_ping_drop_rate", "starlink_status_pop_ping_drop_rate"),
        (
            "status_downlink_throughput_bps",
            "starlink_status_downlink_throughput_bps",
        ),
        (
            "status_uplink_throughput_bps",
            "starlink_status_uplink_throughput_bps",
        ),
        ("status_pop_ping_latency_ms", "starlink_status_pop_ping_latency_ms"),
        ("status_alerts", "starlink_status_alerts"),
        ("status_fraction_obstructed", "starlink_status_fraction_obstructed"),
        ("status_currently_obstructed", "starlink_status_currently_obstructed"),
        ("status_seconds_obstructed", "starlink_status_seconds_obstructed"),
        ("status_obstruction_duration", "starlink_status_obstruction_duration"),
        ("status_obstruction_interval", "starlink_status_obstruction_interval"),
        ("status_direction_azimuth", "starlink_status_direction_azimuth"),
        ("status_direction_elevation", "starlink_status_direction_elevation"),
        (
            "status_is_snr_above_noise_floor",
            "starlink_status_is_snr_above_noise_floor",
        ),
        ("status_alert_motors_stuck", "starlink_status_alert_motors_stuck"),
        (
            "status_alert_thermal_throttle",
            "starlink_status_alert_thermal_throttle",
        ),
        (
            "status_alert_thermal_shutdown",
            "starlink_status_alert_thermal_shutdown",
        ),
        (
            "status_alert_mast_not_near_vertical",
            "starlink_status_alert_mast_not_near_vertical",
        ),
        (
            "status_alert_unexpected_location",
            "starlink_status_alert_unexpected_location",
        ),
        (
            "status_alert_slow_ethernet_speeds",
            "starlink_status_alert_slow_ethernet_speeds",
        ),
        ("status_alert_roaming", "starlink_status_alert_roaming"),
        (
            "status_alert_install_pending",
            "starlink_status_alert_install_pending",
        ),
        ("status_alert_is_heating", "starlink_status_alert_is_heating"),
        (
            "status_alert_power_supply_thermal_throttle",
            "starlink_status_alert_power_supply_thermal_throttle",
        ),
        (
            "status_alert_is_power_save_idle",
            "starlink_status_alert_is_power_save_idle",
        ),
        (
            "status_alert_moving_fast_while_not_aviation",
            "starlink_status_alert_moving_fast_while_not_aviation",
        ),
        (
            "status_alert_moving_while_not_mobile",
            "starlink_status_alert_moving_while_not_mobile",
        ),
        ("ping_stats_samples", "starlink_ping_stats_samples"),
        ("ping_stats_end_counter", "starlink_ping_stats_end_counter"),
        ("usage_download_usage", "starlink_usage_download_usage_bytes"),
        ("usage_upload_usage", "starlink_usage_upload_usage_bytes"),
    )
}

# The dish state is the only non-gauge metric; it is added last so that the
# mapping (and the registry) keep the gauges first.
METRICS["status_state"] = Enum(
    "starlink_status_state",
    "",
    common_labels,
    states=[
        "UNKNOWN",
        "CONNECTED",
        "BOOTING",
        "SEARCHING",
        "STOWED",
        "THERMAL_SHUTDOWN",
        "NO_SATS",
        "OBSTRUCTED",
        "NO_DOWNLINK",
        "NO_PINGS",
        "DISH_UNREACHABLE",
    ],
)
# Static key/value details about the dish.  Unlike the other metrics in this
# module it is created without any label names.
info = Info(name="starlink_info", documentation="")

# Set to 1 for every collected value that has no matching metric definition,
# labelled with the dish id and the name of that value.
unprocessed_metrics = Gauge(
    name="starlink_unprocessed_metrics",
    documentation="",
    labelnames=[*common_labels, "metric"],
)

# Set to 1 for every expected value that was absent from the collected data,
# labelled with the dish id and the name of that value.
missing_metrics = Gauge(
    name="starlink_missing_metrics",
    documentation="",
    labelnames=[*common_labels, "metric"],
)
def parse_args():
    """Parse the command line and return the resulting options.

    Starts from the parser built by ``dish_common.create_arg_parser``, adds an
    "HTTP server options" group with ``--address`` and ``--port``, and hands
    the parser to ``dish_common.run_arg_parser`` together with the list of
    modes this exporter passes for it (status, alert_detail, usage).
    """
    arg_parser = dish_common.create_arg_parser(
        output_description="Prometheus exporter",
        bulk_history=False,
    )

    http_options = arg_parser.add_argument_group(title="HTTP server options")
    http_options.add_argument(
        "--address",
        default="0.0.0.0",
        help="IP address to listen on",
    )
    http_options.add_argument(
        "--port",
        default=8080,
        type=int,
        help="Port to listen on",
    )

    exporter_modes = ["status", "alert_detail", "usage"]
    return dish_common.run_arg_parser(arg_parser, modes=exporter_modes)
class GatherMetrics(threading.Thread):
def __init__(self: Self, opts, gstate, *args, **kw):
self.opts = opts
self.gstate = gstate
super().__init__(*args, **kw)
def run(self: Self):
while True:
self.gather()
time.sleep(5.0)
def gather(self: Self) -> None:
raw_data = {}
def data_add_item(name, value, category):
raw_data[category + "_" + name] = value
def data_add_sequencem(name, value, category, start):
raise NotImplementedError(
f"Did not expect sequence data {name!r} {value!r} {category!r} {start!r}"
)
with self.gstate.lock:
rc, status_ts, hist_ts = dish_common.get_data(
self.opts, self.gstate, data_add_item, data_add_sequencem
)
# snr is not supported by starlink any more but still returned by the grpc
# service for backwards compatibility
if "status_snr" in raw_data:
del raw_data["status_snr"]
status_id = raw_data.get("status_id")
info_metrics = [
"status_id",
"status_hardware_version",
"status_software_version",
]
metrics_not_found = []
metrics_not_found.extend([x for x in info_metrics if x not in raw_data])
info.info(
{
x.replace("status_", ""): raw_data.pop(x)
for x in info_metrics
if x in raw_data
}
)
for name, metric_info in METRICS.items():
if name in raw_data:
match metric_info:
case Gauge():
metric_info.labels(id=status_id).set(raw_data.pop(name) or 0)
case Enum():
metric_info.labels(id=status_id).state(raw_data.pop(name) or 0)
case _:
pass
else:
metrics_not_found.append(name)
for name in raw_data:
unprocessed_metrics.labels(id=status_id, metric=name).set(1)
for name in metrics_not_found:
missing_metrics.labels(id=status_id, metric=name).set(1)
def main():
    """Run the exporter until interrupted.

    Sets up logging, parses the command line, starts the background polling
    thread and serves the metrics over HTTP until Ctrl-C or SIGTERM.  The
    server socket and the dish state are released on the way out, and the
    process then exits through ``sys.exit()``.
    """
    setup_logging()
    opts = parse_args()

    logging.basicConfig(format="%(levelname)s: %(message)s", stream=sys.stderr)

    gstate = dish_common.GlobalState(target=opts.target)
    gstate.lock = threading.Lock()

    try:
        # The polling loop never returns, so the thread must be a daemon;
        # otherwise the interpreter waits for it forever and the process
        # cannot exit after SIGTERM or Ctrl-C.
        gather = GatherMetrics(opts, gstate, daemon=True)
        gather.start()

        # The context manager closes the listening socket on every exit path.
        with ThreadingHTTPServer((opts.address, opts.port), MetricsHandler) as httpd:
            httpd.daemon_threads = False

            signal.signal(signal.SIGTERM, handle_sigterm)

            logger.info("HTTP listening on port %s", opts.port)
            httpd.serve_forever()
    except (KeyboardInterrupt, Terminated):
        pass
    finally:
        gstate.shutdown()

    # Kept outside the ``finally`` so that an unexpected exception propagates
    # with its traceback instead of being replaced by a clean exit status.
    sys.exit()
# Start the exporter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()