Add latency and usage history stat groups

Add latency and usage stat groups to the stats computed from history samples. This includes an attempt at characterizing latency under network load, too, but I don't know how useful that's going to be, so I have marked it as experimental, in case it needs algorithmic improvements.

The new groups are enabled on the command line via the new mode names: ping_latency, ping_loaded_latency, and usage.
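
For a sense of how the new groups surface through the Python module, here is a minimal sketch (the 3600 sample count is just an illustrative value; the slice and unpack mirror what the front-end code in the diff below does):

    import starlink_grpc

    # history_stats returns the group dicts in a fixed order; take only the
    # first 6, since later versions may append more groups.
    groups = starlink_grpc.history_stats(3600)
    general, ping, runlen, latency, loaded, usage = groups[0:6]
    print("mean RTT (no drop):", latency["mean_full_ping_latency"])
    print("bytes downloaded:", usage["download_usage"])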

Add valid_s to the obstruction detail status group. This was the only field still missing from everything available in the status response (other than wedge_fraction_obstructed, which seems redundant to wedge_abs_fraction_obstructed), and I had only skipped it because I don't know exactly what it means. Adding it now, with my best guess at a description, in order to avoid a compatibility-breaking change later.
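
A quick sketch of reading the new field through the module API (status_data returns the obstruction detail dict as its second group, as the diff below shows):

    import starlink_grpc

    status, obstruct_detail, alert_detail = starlink_grpc.status_data()[0:3]
    # valid_s sits alongside the per-wedge obstruction fractions.
    print("valid_s:", obstruct_detail["valid_s"])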

Closes #5
sparky8512 2021-02-01 19:09:34 -08:00
parent 27a98f936b
commit 94114bfd59
3 changed files with 267 additions and 32 deletions


@@ -23,7 +23,9 @@ BRACKETS_RE = re.compile(r"([^[]*)(\[((\d+),|)(\d*)\]|)$")
 
 SAMPLES_DEFAULT = 3600
 LOOP_TIME_DEFAULT = 0
 STATUS_MODES = ["status", "obstruction_detail", "alert_detail"]
-PING_MODES = ["ping_drop", "ping_run_length"]
+HISTORY_STATS_MODES = [
+    "ping_drop", "ping_run_length", "ping_latency", "ping_loaded_latency", "usage"
+]
 UNGROUPED_MODES = []
@@ -37,7 +39,7 @@ def create_arg_parser(output_description, bulk_history=True):
        fromfile_prefix_chars="@",
        add_help=False)
 
-    all_modes = STATUS_MODES + PING_MODES + UNGROUPED_MODES
+    all_modes = STATUS_MODES + HISTORY_STATS_MODES + UNGROUPED_MODES
     if bulk_history:
         all_modes.append("bulk_history")
     parser.add_argument("mode",
@@ -93,7 +95,7 @@ def run_arg_parser(parser, need_id=False, no_stdout_errors=False):
     # for convenience, set flags for whether any mode in a group is selected
     opts.satus_mode = bool(set(STATUS_MODES).intersection(opts.mode))
-    opts.ping_mode = bool(set(PING_MODES).intersection(opts.mode))
+    opts.history_stats_mode = bool(set(HISTORY_STATS_MODES).intersection(opts.mode))
     opts.bulk_mode = "bulk_history" in opts.mode
 
     if opts.samples is None:
@@ -163,8 +165,8 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
     if opts.satus_mode:
         try:
-            status_data, obstruct_detail, alert_detail = starlink_grpc.status_data(
-                context=gstate.context)
+            groups = starlink_grpc.status_data(context=gstate.context)
+            status_data, obstruct_detail, alert_detail = groups[0:3]
         except starlink_grpc.GrpcError as e:
             if "status" in opts.mode:
                 if opts.need_id and gstate.dish_id is None:
@@ -194,11 +196,10 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
         if opts.verbose:
             print("Using dish ID: " + gstate.dish_id)
 
-    if opts.ping_mode:
+    if opts.history_stats_mode:
         try:
-            general, ping, runlen = starlink_grpc.history_ping_stats(opts.samples,
-                                                                     opts.verbose,
-                                                                     context=gstate.context)
+            groups = starlink_grpc.history_stats(opts.samples, opts.verbose, context=gstate.context)
+            general, ping, runlen, latency, loaded, usage = groups[0:6]
         except starlink_grpc.GrpcError as e:
             conn_error(opts, "Failure getting ping stats: %s", str(e))
             return 1
@@ -207,6 +208,12 @@ def get_data(opts, gstate, add_item, add_sequence, add_bulk=None):
             add_data(ping, "ping_stats")
         if "ping_run_length" in opts.mode:
             add_data(runlen, "ping_stats")
+        if "ping_latency" in opts.mode:
+            add_data(latency, "ping_stats")
+        if "ping_loaded_latency" in opts.mode:
+            add_data(loaded, "ping_stats")
+        if "usage" in opts.mode:
+            add_data(usage, "usage")
 
     if opts.bulk_mode and add_bulk:
         before = time.time()
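
Since the new modes ride on the shared argument parser above, a front-end script built on it can simply list them as positional modes; for example (the script name here is hypothetical, not part of this diff):

    python3 dish_grpc_text.py ping_drop ping_latency usage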


@@ -36,6 +36,19 @@ VERBOSE_FIELD_MAP = {
     "final_run_fragment": "Final drop run fragment",
     "run_seconds": "Per-second drop runs",
     "run_minutes": "Per-minute drop runs",
+
+    # ping_latency fields
+    "mean_all_ping_latency": "Mean RTT, drop < 1",
+    "deciles_all_ping_latency": "RTT deciles, drop < 1",
+    "mean_full_ping_latency": "Mean RTT, drop == 0",
+    "deciles_full_ping_latency": "RTT deciles, drop == 0",
+    "stdev_full_ping_latency": "RTT standard deviation, drop == 0",
+
+    # ping_loaded_latency is still experimental, so leave those unexplained
+
+    # usage fields
+    "download_usage": "Bytes downloaded",
+    "upload_usage": "Bytes uploaded",
 }
@@ -85,13 +98,20 @@ def print_header(opts):
         header_add(general)
         header_add(bulk)
 
-    if opts.ping_mode:
-        general, ping, runlen = starlink_grpc.history_ping_field_names()
+    if opts.history_stats_mode:
+        groups = starlink_grpc.history_stats_field_names()
+        general, ping, runlen, latency, loaded, usage = groups[0:6]
         header_add(general)
         if "ping_drop" in opts.mode:
             header_add(ping)
         if "ping_run_length" in opts.mode:
             header_add(runlen)
+        if "ping_latency" in opts.mode:
+            header_add(latency)
+        if "ping_loaded_latency" in opts.mode:
+            header_add(loaded)
+        if "usage" in opts.mode:
+            header_add(usage)
 
     print(",".join(header))
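
For illustration, a rough sketch of how the usage columns end up in the CSV header (this hand-rolls the join; the real header_add also expands bracketed sequence fields into multiple columns):

    import starlink_grpc

    groups = starlink_grpc.history_stats_field_names()
    general, ping, runlen, latency, loaded, usage = groups[0:6]
    # e.g. "samples,end_counter,download_usage,upload_usage"
    print(",".join(general + usage))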


@@ -66,8 +66,8 @@ This group holds information about the current state of the user terminal.
 
 Obstruction detail status data
 ------------------------------
-This group holds a single field, with more detail on the specific areas the
-user terminal has determined to be obstructed.
+This group holds additional detail regarding the specific areas the user
+terminal has determined to be obstructed.
 
 : **wedges_fraction_obstructed** : A 12 element sequence. Each element
     represents a 30 degree wedge of area and its value indicates the fraction
@@ -78,6 +78,9 @@ user terminal has determined to be obstructed.
     the sequence represents the wedge that spans exactly North to 30 degrees
     East of North, and subsequent wedges rotate 30 degrees further in the same
     direction. (It's not clear if this will hold true at all latitudes.)
+: **valid_s** : It is unclear what this field means exactly, but it appears to
+    be a measure of how complete the data is that the user terminal uses to
+    determine obstruction locations.
 
 See also *fraction_obstructed* in general status data, which should equal the
 sum of all *wedges_fraction_obstructed* elements.
@@ -110,9 +113,9 @@ The sample interval is currently 1 second.
 : **samples** : The number of samples analyzed (for statistics) or returned
     (for bulk data).
 : **end_counter** : The total number of data samples that have been written to
-    the history buffer since dish reboot, irrespective of buffer wrap. This
-    can be used to keep track of how many samples are new in comparison to a
-    prior query of the history data.
+    the history buffer since reboot of the user terminal, irrespective of
+    buffer wrap. This can be used to keep track of how many samples are new
+    in comparison to a prior query of the history data.
 
 Bulk history data
 -----------------
@@ -133,9 +136,10 @@ representing the value over time, ending at the current time.
 : **scheduled** : Boolean indicating whether or not a satellite was scheduled
     to be available for transmit/receive during the sample period. When
     false, ping drop shows as "No satellites" in Starlink app.
-: **obstructed** : Boolean indicating whether or not the dish determined the
-    signal between it and the satellite was obstructed during the sample
-    period. When true, ping drop shows as "Obstructed" in the Starlink app.
+: **obstructed** : Boolean indicating whether or not the user terminal
+    determined the signal between it and the satellite was obstructed during
+    the sample period. When true, ping drop shows as "Obstructed" in the
+    Starlink app.
 
 There is no specific data field in the raw history data that directly
 correlates with "Other" or "Beta downtime" in the Starlink app (or whatever it
@@ -208,9 +212,88 @@ of stats, even if they happen at the beginning or end of a run of 100% ping
 drop samples. To compute the amount of time that experienced ping loss in less
 than a single run of 100% ping drop, use (*total_ping_drop* -
 *count_full_ping_drop*) from the ping drop stats.
 
+Ping latency history statistics
+-------------------------------
+This group of statistics characterizes latency of ping request/response in
+various ways. For all non-sequence fields and most sequence elements, the
+value may report as None to indicate no matching samples. The exception is
+*load_bucket_samples* elements, which report 0 for no matching samples.
+
+The fields that have "all" in their name are computed across all samples that
+had any ping success (ping drop < 1). The fields that have "full" in their
+name are computed across only the samples that had 100% ping success (ping
+drop = 0). Which one is more interesting may depend on intended use. A high
+rate of packet loss appears to cause outlier latency values on the high side.
+On the one hand, those are real cases, so they should not be dismissed
+lightly. On the other hand, the "full" numbers are more directly comparable
+to sample sets taken over time.
+
+: **mean_all_ping_latency** : Weighted mean latency value, in milliseconds, of
+    all samples that experienced less than 100% ping drop. Values are weighted
+    by amount of ping success (1 - ping drop).
+: **deciles_all_ping_latency** : An 11 element sequence recording the weighted
+    deciles (10-quantiles) of latency values, in milliseconds, for all samples
+    that experienced less than 100% ping drop, including the minimum and
+    maximum values as the 0th and 10th deciles respectively. The 5th decile
+    (at sequence index 5) is the weighted median latency value.
+: **mean_full_ping_latency** : Mean latency value, in milliseconds, of samples
+    that experienced no ping drop.
+: **deciles_full_ping_latency** : An 11 element sequence recording the deciles
+    (10-quantiles) of latency values, in milliseconds, for all samples that
+    experienced no ping drop, including the minimum and maximum values as the
+    0th and 10th deciles respectively. The 5th decile (at sequence index 5) is
+    the median latency value.
+: **stdev_full_ping_latency** : Population standard deviation of the latency
+    value of samples that experienced no ping drop.
+
+Loaded ping latency statistics
+------------------------------
+This group of statistics attempts to characterize latency of ping
+request/response under various network load conditions. Samples are grouped
+by total (down+up) bandwidth used during the sample period, using a log base
+2 scale. These groups are referred to as "load buckets" below. The first
+bucket in each sequence represents samples that used less than 1Mbps
+(millions of bits per second). Each subsequent bucket covers samples that
+used more bandwidth than the prior buckets, but less than twice the maximum
+bandwidth of the immediately prior bucket. The last bucket, at sequence
+index 14, represents all samples not covered by a prior bucket, which works
+out to any sample using 8192Mbps or greater. Only samples that experienced
+no ping drop are included in any of the buckets.
+
+This group of fields should be considered EXPERIMENTAL and thus subject to
+change without regard to backward compatibility.
+
+Note that in all cases, the latency values are of "ping" traffic, which may be
+prioritized lower than other traffic by various network layers. How much
+bandwidth constitutes a fully loaded network connection may vary over time.
+Buckets with few samples may not contain statistically significant latency
+data.
+
+: **load_bucket_samples** : A 15 element sequence recording the number of
+    samples per load bucket. See above for load bucket partitioning.
+    EXPERIMENTAL.
+: **load_bucket_min_latency** : A 15 element sequence recording the minimum
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+: **load_bucket_median_latency** : A 15 element sequence recording the median
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+: **load_bucket_max_latency** : A 15 element sequence recording the maximum
+    latency value, in milliseconds, per load bucket. EXPERIMENTAL.
+
+Bandwidth usage history statistics
+----------------------------------
+This group of statistics characterizes total bandwidth usage over the sample
+period.
+
+: **download_usage** : Total number of bytes downloaded to the user terminal
+    during the sample period.
+: **upload_usage** : Total number of bytes uploaded from the user terminal
+    during the sample period.
+
 """
 
 from itertools import chain
+import math
+import statistics
 
 import grpc
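
To make the load bucket partitioning concrete, here is a standalone sketch of the index computation that the sampling loop further down applies to each sample (the 500000 constant is in bits per second, so the guard routes everything at or under 0.5Mbps to bucket 0, and int() truncation puts the rest of the sub-1Mbps samples there too):

    import math

    def bucket_index(down_bps, up_bps):
        # Mirrors the expression used in history_stats:
        # min(14, int(math.log2((down+up) / 500000)))
        total = down_bps + up_bps
        if total > 500000:
            return min(14, int(math.log2(total / 500000)))
        return 0

    print(bucket_index(900_000, 0))          # 0: under 1Mbps total
    print(bucket_index(1_500_000, 0))        # 1: 1Mbps up to 2Mbps
    print(bucket_index(3_000_000, 500_000))  # 2: 2Mbps up to 4Mbps
    print(bucket_index(10_000_000_000, 0))   # 14: 8192Mbps or greater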
@@ -233,7 +316,11 @@ class GrpcError(Exception):
 
 class ChannelContext:
-    """A wrapper for reusing an open grpc Channel across calls."""
+    """A wrapper for reusing an open grpc Channel across calls.
+
+    `close()` should be called on the object when it is no longer
+    in use.
+    """
     def __init__(self, target="192.168.100.1:9200"):
         self.channel = None
         self.target = target
@@ -258,9 +345,9 @@ def status_field_names():
         See module level docs regarding brackets in field names.
 
     Returns:
-        A tuple with 3 lists, the first with status data field names, the
-        second with obstruction detail field names, and the third with alert
-        detail field names.
+        A tuple with 3 lists, with status data field names, obstruction
+        detail field names, and alert detail field names, in that
+        order.
     """
     alert_names = []
     for field in spacex.api.device.dish_pb2.DishAlerts.DESCRIPTOR.fields:
@@ -284,6 +371,7 @@ def status_field_names():
         "seconds_obstructed",
     ], [
         "wedges_fraction_obstructed[12]",
+        "valid_s",
     ], alert_names
@@ -346,9 +434,9 @@ def status_data(context=None):
             across repeated calls.
 
     Returns:
-        A tuple with 3 dicts, the first mapping status data names to their
-        values, the second mapping alert detail field names to their values,
-        and the third mapping obstruction detail field names to their values.
+        A tuple with 3 dicts, mapping status data field names, obstruction
+        detail field names, and alert detail field names to their respective
+        values, in that order.
 
     Raises:
         GrpcError: Failed getting history info from the Starlink user
@@ -387,6 +475,7 @@ def status_data(context=None):
         "seconds_obstructed": status.obstruction_stats.last_24h_obstructed_s,
     }, {
         "wedges_fraction_obstructed[]": status.obstruction_stats.wedge_abs_fraction_obstructed,
+        "valid_s": status.obstruction_stats.valid_s,
     }, alerts
@@ -415,15 +504,25 @@ def history_bulk_field_names():
 
 def history_ping_field_names():
+    """Deprecated. Use history_stats_field_names instead."""
+    return history_stats_field_names()[0:3]
+
+
+def history_stats_field_names():
     """Return the field names of the packet loss stats.
 
     Note:
         See module level docs regarding brackets in field names.
 
     Returns:
-        A tuple with 3 lists, the first with general data names, the second
-        with ping drop stat names, and the third with ping drop run length
-        stat names.
+        A tuple with 6 lists, with general data names, ping drop stat names,
+        ping drop run length stat names, ping latency stat names, loaded ping
+        latency stat names, and bandwidth usage stat names, in that order.
+
+    Note:
+        Additional lists may be added to this tuple in the future with
+        additional data groups, so it is not recommended for the caller to
+        assume exactly 6 elements.
     """
     return [
         "samples",
@@ -442,6 +541,20 @@
         "final_run_fragment",
         "run_seconds[1,61]",
         "run_minutes[1,61]",
+    ], [
+        "mean_all_ping_latency",
+        "deciles_all_ping_latency[11]",
+        "mean_full_ping_latency",
+        "deciles_full_ping_latency[11]",
+        "stdev_full_ping_latency",
+    ], [
+        "load_bucket_samples[15]",
+        "load_bucket_min_latency[15]",
+        "load_bucket_median_latency[15]",
+        "load_bucket_max_latency[15]",
+    ], [
+        "download_usage",
+        "upload_usage",
     ]
@@ -592,6 +705,11 @@ def history_bulk_data(parse_samples, start=None, verbose=False, context=None):
 
 def history_ping_stats(parse_samples, verbose=False, context=None):
+    """Deprecated. Use history_stats instead."""
+    return history_stats(parse_samples, verbose=verbose, context=context)[0:3]
+
+
+def history_stats(parse_samples, verbose=False, context=None):
     """Fetch, parse, and compute the packet loss stats.
 
     Note:
@@ -605,9 +723,15 @@ def history_ping_stats(parse_samples, verbose=False, context=None):
             across repeated calls.
 
     Returns:
-        A tuple with 3 dicts, the first mapping general data names to their
-        values, the second mapping ping drop stat names to their values and
-        the third mapping ping drop run length stat names to their values.
+        A tuple with 6 dicts, mapping general data names, ping drop stat
+        names, ping drop run length stat names, ping latency stat names,
+        loaded ping latency stat names, and bandwidth usage stat names to
+        their respective values, in that order.
+
+    Note:
+        Additional dicts may be added to this tuple in the future with
+        additional data groups, so it is not recommended for the caller to
+        assume exactly 6 elements.
 
     Raises:
         GrpcError: Failed getting history info from the Starlink user
@@ -636,6 +760,13 @@
     run_length = 0
     init_run_length = None
 
+    usage_down = 0.0
+    usage_up = 0.0
+
+    rtt_full = []
+    rtt_all = []
+    rtt_buckets = [[] for _ in range(15)]
+
     for i in sample_range:
         d = history.pop_ping_drop_rate[i]
         if d >= 1:
@@ -669,6 +800,22 @@
                 count_full_obstruct += 1
         tot += d
 
+        down = history.downlink_throughput_bps[i]
+        usage_down += down
+        up = history.uplink_throughput_bps[i]
+        usage_up += up
+
+        rtt = history.pop_ping_latency_ms[i]
+        # note that "full" here means the opposite of ping drop full
+        if d == 0.0:
+            rtt_full.append(rtt)
+            if down + up > 500000:
+                rtt_buckets[min(14, int(math.log2((down + up) / 500000)))].append(rtt)
+            else:
+                rtt_buckets[0].append(rtt)
+        if d < 1.0:
+            rtt_all.append((rtt, 1.0 - d))
+
     # If the entire sample set is one big drop run, it will be both initial
     # fragment (continued from prior sample range) and final one (continued
     # to next sample range), but to avoid double-reporting, just call it
@@ -677,6 +824,53 @@
         init_run_length = run_length
         run_length = 0
 
+    def weighted_mean_and_quantiles(data, n):
+        if not data:
+            return None, [None] * (n + 1)
+        total_weight = sum(x[1] for x in data)
+        result = []
+        items = iter(data)
+        value, accum_weight = next(items)
+        accum_value = value * accum_weight
+        for boundary in (total_weight * x / n for x in range(n)):
+            while accum_weight < boundary:
+                try:
+                    value, weight = next(items)
+                    accum_value += value * weight
+                    accum_weight += weight
+                except StopIteration:
+                    # shouldn't happen, but in case of float precision weirdness...
+                    break
+            result.append(value)
+        result.append(data[-1][0])
+        accum_value += sum(x[0] * x[1] for x in items)
+        return accum_value / total_weight, result
+
+    bucket_samples = []
+    bucket_min = []
+    bucket_median = []
+    bucket_max = []
+    for bucket in rtt_buckets:
+        if bucket:
+            bucket_samples.append(len(bucket))
+            bucket_min.append(min(bucket))
+            bucket_median.append(statistics.median(bucket))
+            bucket_max.append(max(bucket))
+        else:
+            bucket_samples.append(0)
+            bucket_min.append(None)
+            bucket_median.append(None)
+            bucket_max.append(None)
+
+    rtt_all.sort(key=lambda x: x[0])
+    wmean_all, wdeciles_all = weighted_mean_and_quantiles(rtt_all, 10)
+
+    if rtt_full:
+        deciles_full = [min(rtt_full)]
+        deciles_full.extend(statistics.quantiles(rtt_full, n=10, method="inclusive"))
+        deciles_full.append(max(rtt_full))
+    else:
+        deciles_full = [None] * 11
+
     return {
         "samples": parse_samples,
         "end_counter": current,
@@ -694,4 +888,18 @@
         "final_run_fragment": run_length,
         "run_seconds[1,]": second_runs,
         "run_minutes[1,]": minute_runs,
+    }, {
+        "mean_all_ping_latency": wmean_all,
+        "deciles_all_ping_latency[]": wdeciles_all,
+        "mean_full_ping_latency": statistics.fmean(rtt_full) if rtt_full else None,
+        "deciles_full_ping_latency[]": deciles_full,
+        "stdev_full_ping_latency": statistics.pstdev(rtt_full) if rtt_full else None,
+    }, {
+        "load_bucket_samples[]": bucket_samples,
+        "load_bucket_min_latency[]": bucket_min,
+        "load_bucket_median_latency[]": bucket_median,
+        "load_bucket_max_latency[]": bucket_max,
+    }, {
+        "download_usage": int(round(usage_down / 8)),
+        "upload_usage": int(round(usage_up / 8)),
     }
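
As a worked check of the weighted decile logic added above, here is the same helper exercised standalone on a tiny hand-made data set (the function is nested inside history_stats, so this copy is lifted out purely for demonstration, and the sample values are made up):

    def weighted_mean_and_quantiles(data, n):
        # Standalone copy of the helper nested in history_stats above.
        if not data:
            return None, [None] * (n + 1)
        total_weight = sum(x[1] for x in data)
        result = []
        items = iter(data)
        value, accum_weight = next(items)
        accum_value = value * accum_weight
        for boundary in (total_weight * x / n for x in range(n)):
            while accum_weight < boundary:
                try:
                    value, weight = next(items)
                    accum_value += value * weight
                    accum_weight += weight
                except StopIteration:
                    break
            result.append(value)
        result.append(data[-1][0])
        # fold any items not consumed by the decile scan into the mean
        accum_value += sum(x[0] * x[1] for x in items)
        return accum_value / total_weight, result

    # (RTT ms, weight) pairs sorted by RTT; weight is ping success (1 - drop).
    rtt_all = [(20.0, 1.0), (25.0, 0.5), (40.0, 1.0), (90.0, 0.25)]
    mean, deciles = weighted_mean_and_quantiles(rtt_all, 10)
    print(round(mean, 2))  # 34.55 == (20*1 + 25*0.5 + 40*1 + 90*0.25) / 2.75
    print(deciles[0], deciles[5], deciles[10])  # 20.0 (min), 25.0, 90.0 (max)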