#!/usr/bin/python3
######################################################################
#
# Write Starlink user terminal packet loss, latency, and usage data
# to an InfluxDB database.
#
# This script examines the most recent samples from the history data,
# and either writes them in whole, or computes several different
# metrics related to packet loss and writes those, to the specified
# InfluxDB database.
#
# NOTE: The Starlink user terminal does not include time values with
# its history or status data, so this script uses current system time
# to compute the timestamps it sends to InfluxDB. It is recommended
# to run this script on a host that has its system clock synced via
# NTP. Otherwise, the timestamps may get out of sync with real time.
#
######################################################################
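# Hypothetical example invocation (the script filename here is illustrative):
# write summary ping stats, including run length stats, to a local InfluxDB
# database named "starlinkstats" once per hour:
#
#     python3 dish_grpc_influx.py -r -t 3600 -D starlinkstats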

import getopt
from datetime import datetime
from datetime import timezone
import logging
import os
import signal
import sys
import time
import warnings

from influxdb import InfluxDBClient

import starlink_grpc

BULK_MEASUREMENT = "spacex.starlink.user_terminal.history"
PING_MEASUREMENT = "spacex.starlink.user_terminal.ping_stats"
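# 864000 points is 10 days' worth of bulk history data at the dish's rate of
# one sample per second; see the overflow check in flush_points below.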
MAX_QUEUE_LENGTH = 864000


class Terminated(Exception):
    pass


def handle_sigterm(signum, frame):
    # Turn SIGTERM into an exception so main loop can clean up
    raise Terminated()


def main():
    arg_error = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "abhkn:p:rs:t:vC:D:IP:R:SU:")
    except getopt.GetoptError as err:
        print(str(err))
        arg_error = True

    # Default to 1 hour's worth of data samples.
    samples_default = 3600
    samples = None
    print_usage = False
    verbose = False
    default_loop_time = 0
    loop_time = default_loop_time
    bulk_mode = False
    bulk_skip_query = False
    run_lengths = False
    host_default = "localhost"
    database_default = "starlinkstats"
    icargs = {"host": host_default, "timeout": 5, "database": database_default}
    rp = None
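    # flush_limit is the minimum number of queued points before a write is
    # attempted; max_batch caps how many points go into one write request.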
    flush_limit = 6
    max_batch = 5000

    # For each of these, check that it is both set and not an empty string.
    influxdb_host = os.environ.get("INFLUXDB_HOST")
    if influxdb_host:
        icargs["host"] = influxdb_host
    influxdb_port = os.environ.get("INFLUXDB_PORT")
    if influxdb_port:
        icargs["port"] = int(influxdb_port)
    influxdb_user = os.environ.get("INFLUXDB_USER")
    if influxdb_user:
        icargs["username"] = influxdb_user
    influxdb_pwd = os.environ.get("INFLUXDB_PWD")
    if influxdb_pwd:
        icargs["password"] = influxdb_pwd
    influxdb_db = os.environ.get("INFLUXDB_DB")
    if influxdb_db:
        icargs["database"] = influxdb_db
    influxdb_rp = os.environ.get("INFLUXDB_RP")
    if influxdb_rp:
        rp = influxdb_rp
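    # INFLUXDB_SSL accepts "secure" (verify the server certificate),
    # "insecure" (no verification), or any other non-empty value, which is
    # passed through as the verify_ssl option (for example, a CA cert path).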
    influxdb_ssl = os.environ.get("INFLUXDB_SSL")
    if influxdb_ssl:
        icargs["ssl"] = True
        if influxdb_ssl.lower() == "secure":
            icargs["verify_ssl"] = True
        elif influxdb_ssl.lower() == "insecure":
            icargs["verify_ssl"] = False
        else:
            icargs["verify_ssl"] = influxdb_ssl

    if not arg_error:
        if len(args) > 0:
            arg_error = True
        else:
            for opt, arg in opts:
                if opt == "-a":
                    samples = -1
                elif opt == "-b":
                    bulk_mode = True
                elif opt == "-h":
                    print_usage = True
                elif opt == "-k":
                    bulk_skip_query = True
                elif opt == "-n":
                    icargs["host"] = arg
                elif opt == "-p":
                    icargs["port"] = int(arg)
                elif opt == "-r":
                    run_lengths = True
                elif opt == "-s":
                    samples = int(arg)
                elif opt == "-t":
                    loop_time = float(arg)
                elif opt == "-v":
                    verbose = True
                elif opt == "-C":
                    icargs["ssl"] = True
                    icargs["verify_ssl"] = arg
                elif opt == "-D":
                    icargs["database"] = arg
                elif opt == "-I":
                    icargs["ssl"] = True
                    icargs["verify_ssl"] = False
                elif opt == "-P":
                    icargs["password"] = arg
                elif opt == "-R":
                    rp = arg
                elif opt == "-S":
                    icargs["ssl"] = True
                    icargs["verify_ssl"] = True
                elif opt == "-U":
                    icargs["username"] = arg

if "password" in icargs and "username" not in icargs:
|
|
print("Password authentication requires username to be set")
|
|
arg_error = True
|
|
|
|
    if print_usage or arg_error:
        print("Usage: " + sys.argv[0] + " [options...]")
        print("Options:")
        print("    -a: Parse all valid samples")
        print("    -b: Bulk mode: write individual sample data instead of summary stats")
        print("    -h: Be helpful")
        print("    -k: Skip querying for prior sample write point in bulk mode")
        print("    -n <name>: Hostname of InfluxDB server, default: " + host_default)
        print("    -p <num>: Port number to use on InfluxDB server")
        print("    -r: Include ping drop run length stats")
        print("    -s <num>: Number of data samples to parse; in bulk mode, applies to first")
        print("              loop iteration only, default: -1 in bulk mode, loop interval if")
        print("              loop interval set, else " + str(samples_default))
        print("    -t <num>: Loop interval in seconds or 0 for no loop, default: " +
              str(default_loop_time))
        print("    -v: Be verbose")
        print("    -C <filename>: Enable SSL/TLS using specified CA cert to verify server")
        print("    -D <name>: Database name to use, default: " + database_default)
        print("    -I: Enable SSL/TLS but disable certificate verification (INSECURE!)")
        print("    -P <word>: Set password for authentication")
        print("    -R <name>: Retention policy name to use")
        print("    -S: Enable SSL/TLS using default CA cert")
        print("    -U <name>: Set username for authentication")
        sys.exit(1 if arg_error else 0)

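    # Apply the documented -s default: all samples in bulk mode, one loop
    # interval's worth when looping, otherwise an hour's worth of samples.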
    if samples is None:
        samples = -1 if bulk_mode else int(loop_time) if loop_time > 0 else samples_default

    logging.basicConfig(format="%(levelname)s: %(message)s")

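    # Mutable state shared by the nested helper functions below: the dish ID
    # used as the point tag, the queue of points awaiting write, and the
    # sample counter and timestamp of the last history sample processed.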
    class GlobalState:
        pass

    gstate = GlobalState()
    gstate.dish_id = None
    gstate.points = []
    gstate.counter = None
    gstate.timestamp = None
    gstate.query_done = bulk_skip_query

    def conn_error(msg, *args):
        # Connection errors that happen in an interval loop are not critical
        # failures, but are interesting enough to print in non-verbose mode.
        if loop_time > 0:
            print(msg % args)
        else:
            logging.error(msg, *args)

    def flush_points(client):
        # Don't flush points to the server if the counter query failed, since
        # some may be discarded later. The write would probably fail, anyway.
        if bulk_mode and not gstate.query_done:
            return 1

        try:
            while len(gstate.points) > max_batch:
                client.write_points(gstate.points[:max_batch],
                                    time_precision="s",
                                    retention_policy=rp)
                if verbose:
                    print("Data points written: " + str(max_batch))
                del gstate.points[:max_batch]
            if gstate.points:
                client.write_points(gstate.points, time_precision="s", retention_policy=rp)
                if verbose:
                    print("Data points written: " + str(len(gstate.points)))
                gstate.points.clear()
        except Exception as e:
            conn_error("Failed writing to InfluxDB database: %s", str(e))
            # If failures persist, don't just use infinite memory. Max queue
            # is currently 10 days of bulk data, so something is very wrong
            # if it's ever exceeded.
            if len(gstate.points) > MAX_QUEUE_LENGTH:
                logging.error("Max write queue exceeded, discarding data.")
                del gstate.points[:-MAX_QUEUE_LENGTH]
            return 1

        return 0

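    # Look up the newest sample previously written to the database for this
    # dish, so a restarted script can resume its sample counter and time base
    # from where the prior run left off instead of starting fresh.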
    def query_counter(client, now, len_points):
        try:
            # fetch the latest point where counter field was recorded
            result = client.query("SELECT counter FROM \"{0}\" "
                                  "WHERE time>={1}s AND time<{2}s AND id=$id "
                                  "ORDER by time DESC LIMIT 1;".format(
                                      BULK_MEASUREMENT, now - len_points, now),
                                  bind_params={"id": gstate.dish_id},
                                  epoch="s")
            rpoints = list(result.get_points())
            if rpoints:
                counter = rpoints[0].get("counter", None)
                timestamp = rpoints[0].get("time", 0)
                if counter and timestamp:
                    return int(counter), int(timestamp)
        except TypeError as e:
            # bind_params was added in influxdb-python v5.2.3. That would be
            # easy enough to work around, but older versions had other problems
            # with query(), so just skip this functionality.
            logging.error(
                "Failed running query, probably due to influxdb-python version too old. "
                "Skipping resumption from prior counter value. Reported error was: %s", str(e))

        return None, 0

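    # The dish does not timestamp its history samples, so sample times are
    # tracked by pairing the history's monotonically increasing sample counter
    # with a system clock time base, re-synced whenever the two drift apart.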
    def process_bulk_data(client):
        before = time.time()

        start = gstate.counter
        parse_samples = samples if start is None else -1
        general, bulk = starlink_grpc.history_bulk_data(parse_samples, start=start, verbose=verbose)

        after = time.time()
        parsed_samples = general["samples"]
        new_counter = general["end_counter"]
        timestamp = gstate.timestamp
        # check this first, so it doesn't report as lost time sync
        if gstate.counter is not None and new_counter != gstate.counter + parsed_samples:
            timestamp = None
        # Allow up to 2 seconds of time drift before forcibly re-syncing, since
        # +/- 1 second can happen just due to scheduler timing.
        if timestamp is not None and not before - 2.0 <= timestamp + parsed_samples <= after + 2.0:
            if verbose:
                print("Lost sample time sync at: " +
                      str(datetime.fromtimestamp(timestamp + parsed_samples, tz=timezone.utc)))
            timestamp = None
        if timestamp is None:
            timestamp = int(before)
            if verbose and gstate.query_done:
                print("Establishing new time base: {0} -> {1}".format(
                    new_counter, datetime.fromtimestamp(timestamp, tz=timezone.utc)))
            timestamp -= parsed_samples

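        # Build one InfluxDB point per sample, dropping None values, since
        # InfluxDB has no null field values.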
        for i in range(parsed_samples):
            timestamp += 1
            gstate.points.append({
                "measurement": BULK_MEASUREMENT,
                "tags": {
                    "id": gstate.dish_id
                },
                "time": timestamp,
                "fields": {k: v[i] for k, v in bulk.items() if v[i] is not None},
            })

        # save off the counter value for script restart
        if parsed_samples:
            gstate.points[-1]["fields"]["counter"] = new_counter

        gstate.counter = new_counter
        gstate.timestamp = timestamp

        # This is done here, rather than before the points were processed,
        # because if the query previously failed, there will be points left
        # over from a prior loop. This avoids handling that as a special case.
        if not gstate.query_done:
            try:
                db_counter, db_timestamp = query_counter(client, timestamp, len(gstate.points))
            except Exception as e:
                # could be a temporary outage, so try again next time
                conn_error("Failed querying InfluxDB for prior count: %s", str(e))
                return
            gstate.query_done = True
            start_counter = new_counter - len(gstate.points)
            if db_counter and start_counter <= db_counter < new_counter:
                del gstate.points[:db_counter - start_counter]
                if before - 2.0 <= db_timestamp + len(gstate.points) <= after + 2.0:
                    if verbose:
                        print("Using existing time base: {0} -> {1}".format(
                            db_counter, datetime.fromtimestamp(db_timestamp, tz=timezone.utc)))
                    for point in gstate.points:
                        db_timestamp += 1
                        point["time"] = db_timestamp
                    gstate.timestamp = db_timestamp
                    return
            if verbose:
                print("Establishing new time base: {0} -> {1}".format(
                    new_counter, datetime.fromtimestamp(timestamp, tz=timezone.utc)))

    def process_ping_stats():
        timestamp = time.time()

        general, pd_stats, rl_stats = starlink_grpc.history_ping_stats(samples, verbose)

        all_stats = general.copy()
        all_stats.update(pd_stats)
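        # Run length stats come back as sequences; flatten them into numbered
        # scalar fields, since InfluxDB field values cannot be arrays.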
        if run_lengths:
            for k, v in rl_stats.items():
                if k.startswith("run_"):
                    for i, subv in enumerate(v, start=1):
                        all_stats[k + "_" + str(i)] = subv
                else:
                    all_stats[k] = v

        gstate.points.append({
            "measurement": PING_MEASUREMENT,
            "tags": {
                "id": gstate.dish_id
            },
            "time": int(timestamp),
            "fields": all_stats,
        })

    def loop_body(client):
        if gstate.dish_id is None:
            try:
                gstate.dish_id = starlink_grpc.get_id()
                if verbose:
                    print("Using dish ID: " + gstate.dish_id)
            except starlink_grpc.GrpcError as e:
                conn_error("Failure getting dish ID: %s", str(e))
                return 1

        if bulk_mode:
            try:
                process_bulk_data(client)
            except starlink_grpc.GrpcError as e:
                conn_error("Failure getting history: %s", str(e))
                return 1
        else:
            try:
                process_ping_stats()
            except starlink_grpc.GrpcError as e:
                conn_error("Failure getting ping stats: %s", str(e))
                return 1

        if verbose:
            print("Data points queued: " + str(len(gstate.points)))

        if len(gstate.points) >= flush_limit:
            return flush_points(client)

        return 0

if "verify_ssl" in icargs and not icargs["verify_ssl"]:
|
|
# user has explicitly said be insecure, so don't warn about it
|
|
warnings.filterwarnings("ignore", message="Unverified HTTPS request")
|
|
|
|
signal.signal(signal.SIGTERM, handle_sigterm)
|
|
try:
|
|
# attempt to hack around breakage between influxdb-python client and 2.0 server:
|
|
influx_client = InfluxDBClient(**icargs, headers={"Accept": "application/json"})
|
|
except TypeError:
|
|
# ...unless influxdb-python package version is too old
|
|
influx_client = InfluxDBClient(**icargs)
|
|
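    # Schedule loop iterations with a monotonic clock so the interval stays
    # fixed even if an iteration runs long or the wall clock steps.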
    rc = 0  # ensure a defined exit status if terminated during the first pass
    try:
        next_loop = time.monotonic()
        while True:
            rc = loop_body(influx_client)
            if loop_time > 0:
                now = time.monotonic()
                next_loop = max(next_loop + loop_time, now)
                time.sleep(next_loop - now)
            else:
                break
    except Terminated:
        pass
    finally:
        if gstate.points:
            rc = flush_points(influx_client)
        influx_client.close()

    sys.exit(rc)


if __name__ == '__main__':
    main()