# Monitoring Stack: Prometheus + Grafana + AlertManager
# Tracks security, resource usage, and overall system health.
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    app.kubernetes.io/name: monitoring
---
# Prometheus ConfigMap: main scrape config plus alerting rules.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
            action: keep
            regex: default;kubernetes;https

      # Kubernetes nodes (kubelet metrics proxied through the API server)
      - job_name: 'kubernetes-nodes'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: '(.+)'
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Kubernetes pods (auto-discovery via prometheus.io/* annotations)
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Only scrape pods that opt in with prometheus.io/scrape: "true"
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: "true"
          # Allow overriding the metrics path via annotation
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: '(.+)'
          # Rewrite the scrape address to the annotated port
          - source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            action: replace
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # GooSeek services (direct, static targets)
      - job_name: 'gooseek-services'
        static_configs:
          - targets:
              - api-gateway.gooseek.svc:3015
              - llm-svc.gooseek.svc:3020
              - agent-svc.gooseek.svc:3018
              - chat-svc.gooseek.svc:3005
              - search-svc.gooseek.svc:3001
              - learning-svc.gooseek.svc:3034
              - travel-svc.gooseek.svc:3035
              - medicine-svc.gooseek.svc:3037
        metrics_path: /metrics
        relabel_configs:
          # Derive a short "service" label from the target address
          - source_labels: [__address__]
            regex: '(.+)\.gooseek\.svc:(\d+)'
            replacement: '$1'
            target_label: service

  alerts.yml: |
    groups:
      - name: security
        rules:
          - alert: HighUnauthorizedRequests
            expr: rate(llm_unauthorized_requests_total[5m]) > 10
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: High unauthorized LLM requests
              description: More than 10 unauthorized requests per second
          - alert: FreeTierAbuse
            expr: rate(llm_free_tier_limit_exceeded_total[5m]) > 5
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Free tier limit exceeded
              description: Users are exceeding LLM free tier limits
          - alert: SuspiciousActivity
            expr: sum by (client_ip) (rate(http_requests_total[5m])) > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Suspicious activity detected
              description: High request rate from single IP

      - name: resources
        rules:
          - alert: HighCPUUsage
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: High CPU usage
              description: CPU usage is above 80 percent
          - alert: LowMemory
            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 20
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: Low memory available
              description: Less than 20 percent memory available
          - alert: DiskSpaceLow
            expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: Low disk space
              description: Less than 15 percent disk space available

      - name: availability
        rules:
          - alert: ServiceDown
            expr: up{job="gooseek-services"} == 0
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: Service is down
              description: A GooSeek service is not responding
          - alert: HighLatency
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 5
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: High latency detected
              description: P95 latency is above 5 seconds
          - alert: HighErrorRate
            expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: High error rate
              description: Error rate is above 5 percent
---
# Prometheus RBAC: read-only access to cluster objects for service discovery.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingress moved out of the removed "extensions" group; grant on
  # networking.k8s.io, which is the only served group on current clusters.
  - apiGroups: ["networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
---
# Prometheus Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - name: prometheus
          image: prom/prometheus:v2.50.0
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=30d"
            # Enables POST /-/reload; safe here because the Service is
            # ClusterIP-only and not exposed via an Ingress.
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: rules
              mountPath: /etc/prometheus/rules
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: 1000m
              memory: 2Gi
      volumes:
        # Restrict to the main config file only; alerts.yml is mounted
        # separately under /etc/prometheus/rules so the rule_files glob
        # picks it up without a duplicate copy at /etc/prometheus.
        - name: config
          configMap:
            name: prometheus-config
            items:
              - key: prometheus.yml
                path: prometheus.yml
        - name: rules
          configMap:
            name: prometheus-config
            items:
              - key: alerts.yml
                path: alerts.yml
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-pvc
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - port: 9090
      targetPort: 9090
---
# AlertManager ConfigMap: routes all alerts to the API gateway's
# Telegram webhook; critical alerts inhibit matching warnings.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m

    route:
      group_by: ['alertname', 'severity']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: 'telegram'
      routes:
        - match:
            severity: critical
          receiver: 'telegram'
          continue: true

    receivers:
      - name: 'telegram'
        webhook_configs:
          - url: 'http://api-gateway.gooseek.svc:3015/api/v1/alerts/webhook'
            send_resolved: true

    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname']
---
# AlertManager Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - "--config.file=/etc/alertmanager/alertmanager.yml"
            - "--storage.path=/alertmanager"
          ports:
            - containerPort: 9093
          volumeMounts:
            - name: config
              mountPath: /etc/alertmanager
            - name: data
              mountPath: /alertmanager
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: alertmanager-config
        # Silences/notification log are ephemeral; acceptable for a
        # single-replica deployment.
        - name: data
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: alertmanager
  ports:
    - port: 9093
      targetPort: 9093
---
# Grafana ConfigMap: base ini settings and the Prometheus datasource.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  grafana.ini: |
    [server]
    root_url = https://grafana.gooseek.ru

    [security]
    admin_user = admin
    # The admin password is injected via the GF_SECURITY_ADMIN_PASSWORD
    # environment variable (sourced from the grafana-secrets Secret).
    # Do not put it here: Grafana does not expand ${VAR} placeholders in
    # ini files, and ConfigMaps are not a safe place for secrets.

    [auth.anonymous]
    enabled = false

    [dashboards]
    default_home_dashboard_path = /var/lib/grafana/dashboards/security.json

  datasources.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: false
---
# Grafana Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:10.3.3
          ports:
            - containerPort: 3000
          env:
            # GF_* env vars override grafana.ini settings.
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-secrets
                  key: admin-password
            - name: GF_INSTALL_PLUGINS
              value: "grafana-piechart-panel,grafana-clock-panel"
          volumeMounts:
            - name: config
              mountPath: /etc/grafana/grafana.ini
              subPath: grafana.ini
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
            - name: dashboards-config
              mountPath: /etc/grafana/provisioning/dashboards
            - name: dashboards
              mountPath: /var/lib/grafana/dashboards
            - name: data
              mountPath: /var/lib/grafana
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: config
          configMap:
            name: grafana-config
        - name: datasources
          configMap:
            name: grafana-config
            items:
              - key: datasources.yml
                path: datasources.yml
        - name: dashboards-config
          configMap:
            name: grafana-dashboards-config
        # NOTE(review): the "grafana-dashboards" ConfigMap (dashboard JSON
        # files) is not defined in this manifest — confirm it is created
        # elsewhere, or pod startup will fail on the missing volume source.
        - name: dashboards
          configMap:
            name: grafana-dashboards
        - name: data
          persistentVolumeClaim:
            claimName: grafana-pvc
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-pvc
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
---
# NOTE(review): the "${GRAFANA_ADMIN_PASSWORD}" placeholder is expected to
# be substituted by the deployment pipeline (e.g. envsubst) before apply —
# confirm; otherwise the literal string becomes the admin password.
apiVersion: v1
kind: Secret
metadata:
  name: grafana-secrets
  namespace: monitoring
type: Opaque
stringData:
  admin-password: "${GRAFANA_ADMIN_PASSWORD}"
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - port: 3000
      targetPort: 3000
---
# Grafana dashboard provisioning: load JSON dashboards from disk.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-config
  namespace: monitoring
data:
  dashboards.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards
---
# Grafana Ingress (TLS via cert-manager)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - grafana.gooseek.ru
      secretName: grafana-tls
  rules:
    - host: grafana.gooseek.ru
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
---
# Node Exporter DaemonSet (host-level metrics)
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
    spec:
      # NOTE(review): no tolerations are set, so tainted (e.g. control
      # plane) nodes will not be covered — add tolerations if those nodes
      # should report metrics too.
      hostNetwork: true
      hostPID: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.7.0
          args:
            - "--path.procfs=/host/proc"
            - "--path.sysfs=/host/sys"
            - "--path.rootfs=/host/root"
            # "$$" is Kubernetes' escape for a literal "$" in args, so the
            # container receives the regex anchor "($|/)" as intended.
            - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
          ports:
            - containerPort: 9100
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 128Mi
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
---
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: monitoring
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "9100"
spec:
  type: ClusterIP
  selector:
    app: node-exporter
  ports:
    - port: 9100
      targetPort: 9100