Add latency and usage history stat groups

Add latency and usage stat groups to the stats computed from history samples. This includes an attempt at characterizing latency under network load, too, but I don't know how useful that's going to be, so I have marked it as experimental, in case it needs algorithmic improvements.

The new groups are enabled on the command line via the new mode names: ping_latency, ping_loaded_latency, and usage.
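
For a sense of how the new groups surface through the Python module, here is a minimal sketch (the 3600 sample count is just an illustrative value; the slice and unpack mirror what the front-end code in the diff below does):

    import starlink_grpc

    # history_stats returns the group dicts in a fixed order; take only the
    # first 6, since later versions may append more groups.
    groups = starlink_grpc.history_stats(3600)
    general, ping, runlen, latency, loaded, usage = groups[0:6]
    print("mean RTT (no drop):", latency["mean_full_ping_latency"])
    print("bytes downloaded:", usage["download_usage"])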

Add valid_s to the obstruction detail status group. This was the only field still missing from everything available in the status response (other than wedge_fraction_obstructed, which seems redundant to wedge_abs_fraction_obstructed), and I had only skipped it because I don't know exactly what it means. Adding it now, with my best guess at a description, in order to avoid a compatibility-breaking change later.
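
A quick sketch of reading the new field through the module API (status_data returns the obstruction detail dict as its second group, as the diff below shows):

    import starlink_grpc

    status, obstruct_detail, alert_detail = starlink_grpc.status_data()[0:3]
    # valid_s sits alongside the per-wedge obstruction fractions.
    print("valid_s:", obstruct_detail["valid_s"])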

Closes #5
sparky8512 2021-02-01 19:09:34 -08:00
parent 27a98f936b
commit 94114bfd59
3 changed files with 267 additions and 32 deletions


@@ -23,7 +23,9 @@ BRACKETS_RE = re.compile(r"([^[]*)(\[((\d+),|)(\d*)\]|)$")
 
 SAMPLES_DEFAULT = 3600
 LOOP_TIME_DEFAULT = 0
 STATUS_MODES = ["status", "obstruction_detail", "alert_detail"]
-PING_MODES = ["ping_drop", "ping_run_length"]
+HISTORY_STATS_MODES = [
+    "ping_drop", "ping_run_length", "ping_latency", "ping_loaded_latency", "usage"
+]
 UNGROUPED_MODES = []
@@ -37,7 +39,7 @@ def create_arg_parser(output_description, bulk_history=True):
        fromfile_prefix_chars="@",
        add_help=False)
 
-    all_modes = STATUS_MODES + PING_MODES + UNGROUPED_MODES
+    all_modes = STATUS_MODES + HISTORY_STATS_MODES + UNGROUPED_MODES
     if bulk_history:
         all_modes.append("bulk_history")
     parser.add_argument("mode",
@@ -93,7 +95,7 @@ def run_arg_parser(parser, need_id=False, no_stdout_errors=False):
     # for convenience, set flags for whether any mode in a group is selected
     opts.satus_mode = bool(set(STATUS_MODES).intersection(opts.mode))
-    opts.ping_mode = bool(set(PING_MODES).intersection(opts.mode))
+    opts.history_stats_mode = bool(set(HISTORY_STATS_MODES).intersection(opts.mode))
     opts.bulk_mode = "bulk_history" in opts.mode
 
     if opts.samples is None:
@@ -163,8 +165,8 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
     if opts.satus_mode:
         try:
-            status_data, obstruct_detail, alert_detail = starlink_grpc.status_data(
-                context=gstate.context)
+            groups = starlink_grpc.status_data(context=gstate.context)
+            status_data, obstruct_detail, alert_detail = groups[0:3]
         except starlink_grpc.GrpcError as e:
             if "status" in opts.mode:
                 if opts.need_id and gstate.dish_id is None:
@@ -194,11 +196,10 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
         if opts.verbose:
             print("Using dish ID: " + gstate.dish_id)
 
-    if opts.ping_mode:
+    if opts.history_stats_mode:
         try:
-            general, ping, runlen = starlink_grpc.history_ping_stats(opts.samples,
-                                                                     opts.verbose,
-                                                                     context=gstate.context)
+            groups = starlink_grpc.history_stats(opts.samples, opts.verbose, context=gstate.context)
+            general, ping, runlen, latency, loaded, usage = groups[0:6]
         except starlink_grpc.GrpcError as e:
             conn_error(opts, "Failure getting ping stats: %s", str(e))
             return 1
@@ -207,6 +208,12 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
             add_data(ping, "ping_stats")
         if "ping_run_length" in opts.mode:
             add_data(runlen, "ping_stats")
+        if "ping_latency" in opts.mode:
+            add_data(latency, "ping_stats")
+        if "ping_loaded_latency" in opts.mode:
+            add_data(loaded, "ping_stats")
+        if "usage" in opts.mode:
+            add_data(usage, "usage")
 
     if opts.bulk_mode and add_bulk:
         before = time.time()
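
Since the new modes ride on the shared argument parser above, a front-end script built on it can simply list them as positional modes; for example (the script name here is hypothetical, not part of this diff):

    python3 dish_grpc_text.py ping_drop ping_latency usage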


@@ -36,6 +36,19 @@ VERBOSE_FIELD_MAP = {
     "final_run_fragment": "Final drop run fragment",
     "run_seconds": "Per-second drop runs",
     "run_minutes": "Per-minute drop runs",
+
+    # ping_latency fields
+    "mean_all_ping_latency": "Mean RTT, drop < 1",
+    "deciles_all_ping_latency": "RTT deciles, drop < 1",
+    "mean_full_ping_latency": "Mean RTT, drop == 0",
+    "deciles_full_ping_latency": "RTT deciles, drop == 0",
+    "stdev_full_ping_latency": "RTT standard deviation, drop == 0",
+
+    # ping_loaded_latency is still experimental, so leave those unexplained
+
+    # usage fields
+    "download_usage": "Bytes downloaded",
+    "upload_usage": "Bytes uploaded",
 }
@@ -85,13 +98,20 @@ def print_header(opts):
         header_add(general)
         header_add(bulk)
 
-    if opts.ping_mode:
-        general, ping, runlen = starlink_grpc.history_ping_field_names()
+    if opts.history_stats_mode:
+        groups = starlink_grpc.history_stats_field_names()
+        general, ping, runlen, latency, loaded, usage = groups[0:6]
         header_add(general)
         if "ping_drop" in opts.mode:
             header_add(ping)
         if "ping_run_length" in opts.mode:
             header_add(runlen)
+        if "ping_latency" in opts.mode:
+            header_add(latency)
+        if "ping_loaded_latency" in opts.mode:
+            header_add(loaded)
+        if "usage" in opts.mode:
+            header_add(usage)
 
     print(",".join(header))
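
For illustration, a rough sketch of how the usage columns end up in the CSV header (this hand-rolls the join; the real header_add also expands bracketed sequence fields into multiple columns):

    import starlink_grpc

    groups = starlink_grpc.history_stats_field_names()
    general, ping, runlen, latency, loaded, usage = groups[0:6]
    # e.g. "samples,end_counter,download_usage,upload_usage"
    print(",".join(general + usage))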


@@ -66,8 +66,8 @@ This group holds information about the current state of the user terminal.
 
 Obstruction detail status data
 ------------------------------
-This group holds a single field, with more detail on the specific areas the
-user terminal has determined to be obstructed.
+This group holds additional detail regarding the specific areas the user
+terminal has determined to be obstructed.
 
 : **wedges_fraction_obstructed** : A 12 element sequence. Each element
     represents a 30 degree wedge of area and its value indicates the fraction
@@ -78,6 +78,9 @@ user terminal has determined to be obstructed.
     the sequence represents the wedge that spans exactly North to 30 degrees
     East of North, and subsequent wedges rotate 30 degrees further in the same
     direction. (It's not clear if this will hold true at all latitudes.)
+: **valid_s** : It is unclear what this field means exactly, but it appears to
+    be a measure of how complete the data is that the user terminal uses to
+    determine obstruction locations.
 
 See also *fraction_obstructed* in general status data, which should equal the
 sum of all *wedges_fraction_obstructed* elements.
@@ -110,9 +113,9 @@ The sample interval is currently 1 second.
 : **samples** : The number of samples analyzed (for statistics) or returned
     (for bulk data).
 : **end_counter** : The total number of data samples that have been written to
-    the history buffer since dish reboot, irrespective of buffer wrap. This
-    can be used to keep track of how many samples are new in comparison to a
-    prior query of the history data.
+    the history buffer since reboot of the user terminal, irrespective of
+    buffer wrap. This can be used to keep track of how many samples are new
+    in comparison to a prior query of the history data.
 
 Bulk history data
 -----------------
@@ -133,9 +136,10 @@ representing the value over time, ending at the current time.
 : **scheduled** : Boolean indicating whether or not a satellite was scheduled
     to be available for transmit/receive during the sample period. When
     false, ping drop shows as "No satellites" in Starlink app.
-: **obstructed** : Boolean indicating whether or not the dish determined the
-    signal between it and the satellite was obstructed during the sample
-    period. When true, ping drop shows as "Obstructed" in the Starlink app.
+: **obstructed** : Boolean indicating whether or not the user terminal
+    determined the signal between it and the satellite was obstructed during
+    the sample period. When true, ping drop shows as "Obstructed" in the
+    Starlink app.
 
 There is no specific data field in the raw history data that directly
 correlates with "Other" or "Beta downtime" in the Starlink app (or whatever it
@@ -208,9 +212,88 @@ of stats, even if they happen at the beginning or end of a run of 100% ping
 drop samples. To compute the amount of time that experienced ping loss in less
 than a single run of 100% ping drop, use (*total_ping_drop* -
 *count_full_ping_drop*) from the ping drop stats.
 
+Ping latency history statistics
+-------------------------------
+This group of statistics characterizes latency of ping request/response in
+various ways. For all non-sequence fields and most sequence elements, the
+value may report as None to indicate no matching samples. The exception is
+*load_bucket_samples* elements, which report 0 for no matching samples.
+
+The fields that have "all" in their name are computed across all samples that
+had any ping success (ping drop < 1). The fields that have "full" in their
+name are computed across only the samples that had 100% ping success (ping
+drop = 0). Which one is more interesting may depend on intended use. A high
+rate of packet loss appears to cause outlier latency values on the high side.
+On the one hand, those are real cases, so they should not be dismissed
+lightly. On the other hand, the "full" numbers are more directly comparable
+to sample sets taken over time.
+
+: **mean_all_ping_latency** : Weighted mean latency value, in milliseconds, of
+    all samples that experienced less than 100% ping drop. Values are weighted
+    by amount of ping success (1 - ping drop).
+: **deciles_all_ping_latency** : An 11 element sequence recording the weighted
+    deciles (10-quantiles) of latency values, in milliseconds, for all samples
+    that experienced less than 100% ping drop, including the minimum and
+    maximum values as the 0th and 10th deciles respectively. The 5th decile
+    (at sequence index 5) is the weighted median latency value.
+: **mean_full_ping_latency** : Mean latency value, in milliseconds, of samples
+    that experienced no ping drop.
+: **deciles_full_ping_latency** : An 11 element sequence recording the deciles
+    (10-quantiles) of latency values, in milliseconds, for all samples that
+    experienced no ping drop, including the minimum and maximum values as the
+    0th and 10th deciles respectively. The 5th decile (at sequence index 5) is
+    the median latency value.
+: **stdev_full_ping_latency** : Population standard deviation of the latency
+    value of samples that experienced no ping drop.
+
+Loaded ping latency statistics
+------------------------------
+This group of statistics attempts to characterize latency of ping
+request/response under various network load conditions. Samples are grouped
+by total (down+up) bandwidth used during the sample period, using a log base
+2 scale. These groups are referred to as "load buckets" below. The first
+bucket in each sequence represents samples that used less than 1Mbps
+(millions of bits per second). Each subsequent bucket covers samples that
+used more bandwidth than the prior buckets, but less than twice the maximum
+bandwidth of the immediately prior bucket. The last bucket, at sequence
+index 14, represents all samples not covered by a prior bucket, which works
+out to any sample using 8192Mbps or greater. Only samples that experienced
+no ping drop are included in any of the buckets.
+
+This group of fields should be considered EXPERIMENTAL and thus subject to
+change without regard to backward compatibility.
+
+Note that in all cases, the latency values are of "ping" traffic, which may be
+prioritized lower than other traffic by various network layers. How much
+bandwidth constitutes a fully loaded network connection may vary over time.
+Buckets with few samples may not contain statistically significant latency
+data.
+
+: **load_bucket_samples** : A 15 element sequence recording the number of
+    samples per load bucket. See above for load bucket partitioning.
+    EXPERIMENTAL.
+: **load_bucket_min_latency** : A 15 element sequence recording the minimum
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+: **load_bucket_median_latency** : A 15 element sequence recording the median
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+: **load_bucket_max_latency** : A 15 element sequence recording the maximum
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+
+Bandwidth usage history statistics
+----------------------------------
+This group of statistics characterizes total bandwidth usage over the sample
+period.
+
+: **download_usage** : Total number of bytes downloaded to the user terminal
+    during the sample period.
+: **upload_usage** : Total number of bytes uploaded from the user terminal
+    during the sample period.
+
 """
 
 from itertools import chain
+import math
+import statistics
 
 import grpc
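
To make the load bucket partitioning concrete, here is a standalone sketch of the index computation that the sampling loop further down applies to each sample (the 500000 constant is in bits per second, so the guard routes everything at or under 0.5Mbps to bucket 0, and int() truncation puts the rest of the sub-1Mbps samples there too):

    import math

    def bucket_index(down_bps, up_bps):
        # Mirrors the expression used in history_stats:
        # min(14, int(math.log2((down+up) / 500000)))
        total = down_bps + up_bps
        if total > 500000:
            return min(14, int(math.log2(total / 500000)))
        return 0

    print(bucket_index(900_000, 0))          # 0: under 1Mbps total
    print(bucket_index(1_500_000, 0))        # 1: 1Mbps up to 2Mbps
    print(bucket_index(3_000_000, 500_000))  # 2: 2Mbps up to 4Mbps
    print(bucket_index(10_000_000_000, 0))   # 14: 8192Mbps or greater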
@@ -233,7 +316,11 @@ class GrpcError(Exception):
 
 class ChannelContext:
-    """A wrapper for reusing an open grpc Channel across calls."""
+    """A wrapper for reusing an open grpc Channel across calls.
+
+    `close()` should be called on the object when it is no longer
+    in use.
+    """
     def __init__(self, target="192.168.100.1:9200"):
         self.channel = None
         self.target = target
@@ -258,9 +345,9 @@ def status_field_names():
         See module level docs regarding brackets in field names.
 
     Returns:
-        A tuple with 3 lists, the first with status data field names, the
-        second with obstruction detail field names, and the third with alert
-        detail field names.
+        A tuple with 3 lists, with status data field names, obstruction
+        detail field names, and alert detail field names, in that
+        order.
     """
     alert_names = []
     for field in spacex.api.device.dish_pb2.DishAlerts.DESCRIPTOR.fields:
@@ -284,6 +371,7 @@ def status_field_names():
         "seconds_obstructed",
     ], [
         "wedges_fraction_obstructed[12]",
+        "valid_s",
     ], alert_names
@@ -346,9 +434,9 @@ def status_data(context=None):
             across repeated calls.
 
     Returns:
-        A tuple with 3 dicts, the first mapping status data names to their
-        values, the second mapping alert detail field names to their values,
-        and the third mapping obstruction detail field names to their values.
+        A tuple with 3 dicts, mapping status data field names, obstruction
+        detail field names, and alert detail field names to their respective
+        values, in that order.
 
     Raises:
         GrpcError: Failed getting history info from the Starlink user
@@ -387,6 +475,7 @@ def status_data(context=None):
         "seconds_obstructed": status.obstruction_stats.last_24h_obstructed_s,
     }, {
         "wedges_fraction_obstructed[]": status.obstruction_stats.wedge_abs_fraction_obstructed,
+        "valid_s": status.obstruction_stats.valid_s,
     }, alerts
@@ -415,15 +504,25 @@ def history_bulk_field_names():
 
 def history_ping_field_names():
+    """Deprecated. Use history_stats_field_names instead."""
+    return history_stats_field_names()[0:3]
+
+
+def history_stats_field_names():
     """Return the field names of the packet loss stats.
 
     Note:
         See module level docs regarding brackets in field names.
 
     Returns:
-        A tuple with 3 lists, the first with general data names, the second
-        with ping drop stat names, and the third with ping drop run length
-        stat names.
+        A tuple with 6 lists, with general data names, ping drop stat names,
+        ping drop run length stat names, ping latency stat names, loaded ping
+        latency stat names, and bandwidth usage stat names, in that order.
+
+    Note:
+        Additional lists may be added to this tuple in the future with
+        additional data groups, so it is not recommended for the caller to
+        assume exactly 6 elements.
     """
     return [
         "samples",
@@ -442,6 +541,20 @@
         "final_run_fragment",
         "run_seconds[1,61]",
         "run_minutes[1,61]",
+    ], [
+        "mean_all_ping_latency",
+        "deciles_all_ping_latency[11]",
+        "mean_full_ping_latency",
+        "deciles_full_ping_latency[11]",
+        "stdev_full_ping_latency",
+    ], [
+        "load_bucket_samples[15]",
+        "load_bucket_min_latency[15]",
+        "load_bucket_median_latency[15]",
+        "load_bucket_max_latency[15]",
+    ], [
+        "download_usage",
+        "upload_usage",
     ]
@@ -592,6 +705,11 @@ def history_bulk_data(parse_samples, start=None, verbose=False, context=None):
 
 def history_ping_stats(parse_samples, verbose=False, context=None):
+    """Deprecated. Use history_stats instead."""
+    return history_stats(parse_samples, verbose=verbose, context=context)[0:3]
+
+
+def history_stats(parse_samples, verbose=False, context=None):
     """Fetch, parse, and compute the packet loss stats.
 
     Note:
@@ -605,9 +723,15 @@ def history_ping_stats(parse_samples, verbose=False, context=None):
             across repeated calls.
 
     Returns:
-        A tuple with 3 dicts, the first mapping general data names to their
-        values, the second mapping ping drop stat names to their values and
-        the third mapping ping drop run length stat names to their values.
+        A tuple with 6 dicts, mapping general data names, ping drop stat
+        names, ping drop run length stat names, ping latency stat names,
+        loaded ping latency stat names, and bandwidth usage stat names to
+        their respective values, in that order.
+
+    Note:
+        Additional dicts may be added to this tuple in the future with
+        additional data groups, so it is not recommended for the caller to
+        assume exactly 6 elements.
 
     Raises:
         GrpcError: Failed getting history info from the Starlink user
@@ -636,6 +760,13 @@
     run_length = 0
     init_run_length = None
 
+    usage_down = 0.0
+    usage_up = 0.0
+
+    rtt_full = []
+    rtt_all = []
+    rtt_buckets = [[] for _ in range(15)]
+
     for i in sample_range:
         d = history.pop_ping_drop_rate[i]
         if d >= 1:
@@ -669,6 +800,22 @@
                 count_full_obstruct += 1
         tot += d
 
+        down = history.downlink_throughput_bps[i]
+        usage_down += down
+        up = history.uplink_throughput_bps[i]
+        usage_up += up
+
+        rtt = history.pop_ping_latency_ms[i]
+        # note that "full" here means the opposite of ping drop full
+        if d == 0.0:
+            rtt_full.append(rtt)
+            if down + up > 500000:
+                rtt_buckets[min(14, int(math.log2((down + up) / 500000)))].append(rtt)
+            else:
+                rtt_buckets[0].append(rtt)
+        if d < 1.0:
+            rtt_all.append((rtt, 1.0 - d))
+
     # If the entire sample set is one big drop run, it will be both initial
     # fragment (continued from prior sample range) and final one (continued
     # to next sample range), but to avoid double-reporting, just call it
@@ -677,6 +824,53 @@
         init_run_length = run_length
         run_length = 0
 
+    def weighted_mean_and_quantiles(data, n):
+        if not data:
+            return None, [None] * (n + 1)
+        total_weight = sum(x[1] for x in data)
+        result = []
+        items = iter(data)
+        value, accum_weight = next(items)
+        accum_value = value * accum_weight
+        for boundary in (total_weight * x / n for x in range(n)):
+            while accum_weight < boundary:
+                try:
+                    value, weight = next(items)
+                    accum_value += value * weight
+                    accum_weight += weight
+                except StopIteration:
+                    # shouldn't happen, but in case of float precision weirdness...
+                    break
+            result.append(value)
+        result.append(data[-1][0])
+        accum_value += sum(x[0] * x[1] for x in items)
+        return accum_value / total_weight, result
+
+    bucket_samples = []
+    bucket_min = []
+    bucket_median = []
+    bucket_max = []
+    for bucket in rtt_buckets:
+        if bucket:
+            bucket_samples.append(len(bucket))
+            bucket_min.append(min(bucket))
+            bucket_median.append(statistics.median(bucket))
+            bucket_max.append(max(bucket))
+        else:
+            bucket_samples.append(0)
+            bucket_min.append(None)
+            bucket_median.append(None)
+            bucket_max.append(None)
+
+    rtt_all.sort(key=lambda x: x[0])
+    wmean_all, wdeciles_all = weighted_mean_and_quantiles(rtt_all, 10)
+
+    if rtt_full:
+        deciles_full = [min(rtt_full)]
+        deciles_full.extend(statistics.quantiles(rtt_full, n=10, method="inclusive"))
+        deciles_full.append(max(rtt_full))
+    else:
+        deciles_full = [None] * 11
+
     return {
         "samples": parse_samples,
         "end_counter": current,
@@ -694,4 +888,18 @@
         "final_run_fragment": run_length,
         "run_seconds[1,]": second_runs,
         "run_minutes[1,]": minute_runs,
+    }, {
+        "mean_all_ping_latency": wmean_all,
+        "deciles_all_ping_latency[]": wdeciles_all,
+        "mean_full_ping_latency": statistics.fmean(rtt_full) if rtt_full else None,
+        "deciles_full_ping_latency[]": deciles_full,
+        "stdev_full_ping_latency": statistics.pstdev(rtt_full) if rtt_full else None,
+    }, {
+        "load_bucket_samples[]": bucket_samples,
+        "load_bucket_min_latency[]": bucket_min,
+        "load_bucket_median_latency[]": bucket_median,
+        "load_bucket_max_latency[]": bucket_max,
+    }, {
+        "download_usage": int(round(usage_down / 8)),
+        "upload_usage": int(round(usage_up / 8)),
     }
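
As a worked check of the weighted decile logic added above, here is the same helper exercised standalone on a tiny hand-made data set (the function is nested inside history_stats, so this copy is lifted out purely for demonstration, and the sample values are made up):

    def weighted_mean_and_quantiles(data, n):
        # Standalone copy of the helper nested in history_stats above.
        if not data:
            return None, [None] * (n + 1)
        total_weight = sum(x[1] for x in data)
        result = []
        items = iter(data)
        value, accum_weight = next(items)
        accum_value = value * accum_weight
        for boundary in (total_weight * x / n for x in range(n)):
            while accum_weight < boundary:
                try:
                    value, weight = next(items)
                    accum_value += value * weight
                    accum_weight += weight
                except StopIteration:
                    break
            result.append(value)
        result.append(data[-1][0])
        # fold any items not consumed by the decile scan into the mean
        accum_value += sum(x[0] * x[1] for x in items)
        return accum_value / total_weight, result

    # (RTT ms, weight) pairs sorted by RTT; weight is ping success (1 - drop).
    rtt_all = [(20.0, 1.0), (25.0, 0.5), (40.0, 1.0), (90.0, 0.25)]
    mean, deciles = weighted_mean_and_quantiles(rtt_all, 10)
    print(round(mean, 2))  # 34.55 == (20*1 + 25*0.5 + 40*1 + 90*0.25) / 2.75
    print(deciles[0], deciles[5], deciles[10])  # 20.0 (min), 25.0, 90.0 (max)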