Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
title="Network tcp segments",
targets=[
g.Target(
expr="sum(rate(node_netstat_Tcp_InSegs[1m])) by (instance)",
legendFormat="InSegs {{instance}}",
),
g.Target(
expr="sum(rate(node_netstat_Tcp_OutSegs[1m])) by (instance)",
legendFormat="OutSegs {{instance}}",
),
g.Target(
expr="sum(rate(node_netstat_Tcp_RetransSegs[1m])) by (instance)",
legendFormat="RetransSegs {{instance}}",
),
],
yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10),
),
]
# The final dashboard must be named 'dashboard' so that grafanalib will find it.
dashboard = d.Dashboard(
title="Master dashboard",
refresh="",
rows=[
d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
d.Row(title="API call latency aggregated with quantile", panels=QUANTILE_API_CALL_LATENCY_PANELS, collapse=True),
d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True),
d.Row(title="etcd", panels=ETCD_PANELS, collapse=True),
d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True),
d.Row(
title="kube-controller-manager",
panels=[
),
d.simple_graph(
"Component restarts 2",
'sum(min_over_time(container_start_time_seconds{container!="",container!="POD"}[2m])) by (container)',
),
d.simple_graph(
"Active component", "sum(leader_election_master_status) by (name, instance)"
),
]
ETCD_PANELS = [
d.simple_graph("etcd leader", "etcd_server_is_leader", legend="{{instance}}"),
d.simple_graph(
"etcd bytes sent",
"rate(etcd_network_client_grpc_sent_bytes_total[1m])",
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
legend="{{instance}}",
),
d.simple_graph(
"etcd operations rate",
d.one_line(
"""
sum(
rate(
etcd_request_duration_seconds_count{
operation=~"${etcd_operation:regex}",
type=~".*(${etcd_type:pipe})"
}[1m]
)
) by (operation, type)
"""
),
def api_call_latency(title, verb, scope, threshold):
    """Build a graph plotting an API call latency query against a threshold.

    The graph carries two targets, in this order:

    1. ``str(threshold)`` used directly as the expression, labelled
       "threshold".
    2. ``expression`` (defined outside this function; presumably a
       %-style template with ``verb`` and ``scope`` placeholders -- confirm
       at its definition) with ``verb`` and ``scope`` substituted in, then
       passed through ``d.one_line``.

    Args:
        title: Title given to the graph.
        verb: Value substituted for the ``verb`` key of ``expression``.
        scope: Value substituted for the ``scope`` key of ``expression``.
        threshold: Value whose string form becomes the threshold target.

    Returns:
        The ``d.Graph`` built from the two targets, with a single y axis
        using ``g.SECONDS_FORMAT``.
    """
    threshold_target = g.Target(expr=str(threshold), legendFormat="threshold")

    substitutions = {"verb": verb, "scope": scope}
    latency_query = d.one_line(expression % substitutions)
    # TODO(github.com/grafana/grafana/issues/19410): uncomment once fixed
    # legendFormat="{{verb}} {{scope}}/{{resource}}",
    latency_target = g.Target(expr=latency_query)

    seconds_axis = g.single_y_axis(format=g.SECONDS_FORMAT)
    return d.Graph(
        title=title,
        targets=[threshold_target, latency_target],
        yAxes=seconds_axis,
    )
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph("etcd instance id", "sum(etcd_server_id) by (instance, server_id)"),
d.simple_graph(
"etcd network latency (99th percentile)",
"histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (le, instance, To))",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd compaction keys",
"delta(etcd_debugging_mvcc_db_compaction_keys_total[1m])",
),
d.simple_graph(
"etcd compaction pause sum duration",
"delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_sum[1m])",
yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
),
d.simple_graph(
"etcd compaction pause num chunks",
"delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_count[1m])",
),
d.simple_graph(
"etcd_disk_backend_commit_duration_seconds",
"histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) by (le, instance))",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd wal fsync duration",
"histogram_quantile(1.0, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (le, endpoint))",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.Graph(