{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "description": "RetroDash Errors golden signal dashboard for homelab monitoring with node_exporter and kube-state-metrics",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "HTTP 5xx error rate as percentage of total requests",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "0": {
                  "color": "#33FF00",
                  "text": "GOOD"
                },
                "1": {
                  "color": "#FF4444",
                  "text": "ERROR"
                }
              },
              "type": "value"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "percentage",
            "steps": [
              {
                "color": "#33FF00",
                "value": null
              },
              {
                "color": "#FFCC00",
                "value": 5
              },
              {
                "color": "#FF4444",
                "value": 10
              }
            ]
          },
          "unit": "percent"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 6,
        "x": 0,
        "y": 0
      },
      "id": 1,
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "justifyMode": "center",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "text": {}
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "(\n  increase(http_requests_total{status=~\"5..\"}[5m]) /\n  increase(http_requests_total[5m])\n) * 100",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "Error Rate %",
      "transparent": true,
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Count of pods in Failed phase",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "0": {
                  "color": "#33FF00",
                  "text": "OK"
                },
                "1": {
                  "color": "#FF4444",
                  "text": "FAILED"
                }
              },
              "type": "value"
            }
          ],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "#33FF00",
                "value": null
              },
              {
                "color": "#FFCC00",
                "value": 1
              },
              {
                "color": "#FF4444",
                "value": 5
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 6,
        "x": 6,
        "y": 0
      },
      "id": 2,
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "justifyMode": "center",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "text": {}
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "count(kube_pod_status_phase{phase=\"Failed\"}) or vector(0)",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "Failed Pods",
      "transparent": true,
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Pods waiting in CrashLoopBackOff state",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "0": {
                  "color": "#33FF00",
                  "text": "OK"
                },
                "1": {
                  "color": "#FF4444",
                  "text": "LOOP"
                }
              },
              "type": "value"
            }
          ],
          "max": 50,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "#33FF00",
                "value": null
              },
              {
                "color": "#FFCC00",
                "value": 1
              },
              {
                "color": "#FF4444",
                "value": 3
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 6,
        "x": 12,
        "y": 0
      },
      "id": 3,
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "justifyMode": "center",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "text": {}
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "count(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"}) or vector(0)",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "CrashLoopBackOff",
      "transparent": true,
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Pods killed due to out-of-memory conditions",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [
            {
              "options": {
                "0": {
                  "color": "#33FF00",
                  "text": "OK"
                },
                "1": {
                  "color": "#FF4444",
                  "text": "OOM"
                }
              },
              "type": "value"
            }
          ],
          "max": 50,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "#33FF00",
                "value": null
              },
              {
                "color": "#FFCC00",
                "value": 1
              },
              {
                "color": "#FF4444",
                "value": 2
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 6,
        "x": 18,
        "y": 0
      },
      "id": 4,
      "options": {
        "colorMode": "background",
        "graphMode": "none",
        "justifyMode": "center",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "text": {}
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}) or vector(0)",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "",
          "refId": "A"
        }
      ],
      "title": "OOMKilled",
      "transparent": true,
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Error rate over time: HTTP 5xx rate and pod restart rate",
      "fieldConfig": {
        "defaults": {
          "color": {
            "by_field": {}
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 30,
            "gradientMode": "opacity",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          },
          "unit": "short"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "HTTP 5xx Rate"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FF4444",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Pod Restart Rate"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FFCC00",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 0,
        "y": 5
      },
      "id": 5,
      "options": {
        "legend": {
          "calcs": [
            "mean",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "rate(http_requests_total{status=~\"5..\"}[5m])",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "HTTP 5xx Rate",
          "refId": "A"
        },
        {
          "expr": "rate(kube_pod_container_status_restarts_total[5m])",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "Pod Restart Rate",
          "refId": "B"
        }
      ],
      "title": "Error Rate Over Time",
      "transparent": true,
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Pod restart timeline by pod namespace and name",
      "fieldConfig": {
        "defaults": {
          "color": {
            "by_field": {}
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 20,
            "gradientMode": "opacity",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 2,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": true,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          },
          "unit": "ops"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": ""
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FFCC00",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 10,
        "w": 12,
        "x": 12,
        "y": 5
      },
      "id": 6,
      "options": {
        "legend": {
          "calcs": [
            "mean",
            "max"
          ],
          "displayMode": "table",
          "placement": "right",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "sum by (pod, namespace) (rate(kube_pod_container_status_restarts_total[5m]))",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{namespace}}/{{pod}}",
          "refId": "A"
        }
      ],
      "title": "Pod Restart Timeline",
      "transparent": true,
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Distribution of pod termination reasons",
      "fieldConfig": {
        "defaults": {
          "color": {
            "by_field": {}
          },
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          },
          "unit": "short"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "OOMKilled"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FF4444",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "CrashLoopBackOff"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FFCC00",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "ImagePullBackOff"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "#FF8800",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 0,
        "y": 15
      },
      "id": 7,
      "options": {
        "displayLabels": [
          "percent"
        ],
        "legend": {
          "calcs": [
            "value"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true,
          "values": [
            "value"
          ]
        },
        "pieType": "donut",
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "count by (reason) (kube_pod_container_status_last_terminated_reason)",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{reason}}",
          "refId": "A"
        }
      ],
      "title": "Errors by Type",
      "transparent": true,
      "type": "piechart"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Recent pod error events with reason and occurrence count",
      "fieldConfig": {
        "defaults": {
          "custom": {
            "align": "auto",
            "cellOptions": {
              "type": "auto"
            },
            "inspect": false
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          }
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "Pod"
            },
            "properties": [
              {
                "id": "custom.width",
                "value": 200
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Reason"
            },
            "properties": [
              {
                "id": "custom.width",
                "value": 150
              },
              {
                "id": "color",
                "value": {
                  "mode": "thresholds"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "Count"
            },
            "properties": [
              {
                "id": "custom.width",
                "value": 80
              },
              {
                "id": "color",
                "value": {
                  "mode": "thresholds"
                }
              },
              {
                "id": "thresholds",
                "value": {
                  "mode": "absolute",
                  "steps": [
                    {
                      "color": "#33FF00",
                      "value": null
                    },
                    {
                      "color": "#FFCC00",
                      "value": 3
                    },
                    {
                      "color": "#FF4444",
                      "value": 10
                    }
                  ]
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 12,
        "x": 12,
        "y": 15
      },
      "id": 8,
      "options": {
        "footer": {
          "countRows": false,
          "fields": "",
          "reducer": [
            "sum"
          ],
          "show": false
        },
        "showHeader": true,
        "sortBy": []
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "expr": "topk(10, count by (pod, namespace, reason) (kube_pod_container_status_last_terminated_reason))",
          "format": "table",
          "instant": true,
          "intervalFactor": 1,
          "refId": "A"
        }
      ],
      "title": "Recent Error Events",
      "transparent": true,
      "transformations": [
        {
          "id": "organize",
          "options": {
            "excludeByName": {
              "Time": true,
              "__name__": true
            },
            "indexByName": {},
            "renameByName": {
              "namespace": "Namespace",
              "pod": "Pod",
              "reason": "Reason",
              "Value": "Count"
            }
          }
        }
      ],
      "type": "table"
    }
  ],
  "refresh": "10s",
  "schemaVersion": 39,
  "tags": [
    "retrodash",
    "golden-signals",
    "errors"
  ],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "Prometheus",
          "value": "Prometheus"
        },
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": false,
        "label": "Datasource",
        "multi": false,
        "name": "DS_PROMETHEUS",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "datasource",
        "useTags": false
      },
      {
        "name": "job_node",
        "type": "custom",
        "label": "Node Exporter Job",
        "current": {
          "text": "node-exporter",
          "value": "node-exporter"
        },
        "options": [
          {
            "text": "node-exporter",
            "value": "node-exporter",
            "selected": true
          },
          {
            "text": "node",
            "value": "node",
            "selected": false
          }
        ],
        "query": "node-exporter,node"
      }
    ]
  },
  "time": {
    "from": "now-30m",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "UTC",
  "title": "RetroDash // ERRORS",
  "uid": "retrodash-errors",
  "version": 1,
  "weekStart": "",
  "__inputs": [
    {
      "name": "DS_PROMETHEUS",
      "label": "Prometheus",
      "description": "Prometheus datasource",
      "type": "datasource",
      "pluginId": "prometheus",
      "pluginName": "Prometheus"
    }
  ]
}
