Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
- Add tier-based provider routing in llm-svc
  - free tier → Ollama (local qwen3.5:9b)
  - pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
675 lines
17 KiB
YAML
675 lines
17 KiB
YAML
# Monitoring stack: Prometheus + Grafana + Alertmanager.
# Tracks security, resource usage, and overall system health.
---
# Namespace referenced by the namespaced monitoring resources in this file.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app.kubernetes.io/name: monitoring
  name: monitoring
---
# Prometheus ConfigMap
# Holds two files: the main Prometheus configuration (prometheus.yml) and the
# alerting rules (alerts.yml) that prometheus.yml loads through rule_files.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Keep only the https port of the "kubernetes" service in "default".
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # Kubernetes nodes
      - job_name: 'kubernetes-nodes'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Scrape each node through the API server proxy endpoint.
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Kubernetes pods (auto-discovery)
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          # Only pods annotated with prometheus.io/scrape: "true" are kept.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # Optional prometheus.io/path annotation overrides the metrics path.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # prometheus.io/port annotation replaces the port of the target address.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # GooSeek services (direct)
      - job_name: 'gooseek-services'
        static_configs:
          - targets:
              - api-gateway.gooseek.svc:3015
              - llm-svc.gooseek.svc:3020
              - agent-svc.gooseek.svc:3018
              - chat-svc.gooseek.svc:3005
              - search-svc.gooseek.svc:3001
              - learning-svc.gooseek.svc:3034
              - travel-svc.gooseek.svc:3035
              - medicine-svc.gooseek.svc:3037
        metrics_path: /metrics
        relabel_configs:
          # Derive a short "service" label from the target host name.
          - source_labels: [__address__]
            regex: (.+)\.gooseek\.svc:(\d+)
            replacement: $1
            target_label: service

  alerts.yml: |
    groups:
      - name: security
        rules:
          - alert: HighUnauthorizedRequests
            expr: rate(llm_unauthorized_requests_total[5m]) > 10
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: High unauthorized LLM requests
              description: More than 10 unauthorized requests per second

          - alert: FreeTierAbuse
            expr: rate(llm_free_tier_limit_exceeded_total[5m]) > 5
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Free tier limit exceeded
              description: Users are exceeding LLM free tier limits

          - alert: SuspiciousActivity
            expr: sum by (client_ip) (rate(http_requests_total[5m])) > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Suspicious activity detected
              description: High request rate from single IP

      - name: resources
        rules:
          - alert: HighCPUUsage
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: High CPU usage
              description: CPU usage is above 80 percent

          - alert: LowMemory
            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 20
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: Low memory available
              description: Less than 20 percent memory available

          - alert: DiskSpaceLow
            expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: Low disk space
              description: Less than 15 percent disk space available

      - name: availability
        rules:
          - alert: ServiceDown
            expr: up{job="gooseek-services"} == 0
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: Service is down
              description: A GooSeek service is not responding

          - alert: HighLatency
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 5
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: High latency detected
              description: P95 latency is above 5 seconds

          - alert: HighErrorRate
            # Both sides are aggregated to the same label set before dividing.
            # Without aggregation the division matches series on every label,
            # including "status", so each 5xx series is divided by itself and
            # the ratio is always 1.
            expr: |
              sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
                / sum by (job, instance) (rate(http_requests_total[5m]))
                > 0.05
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: High error rate
              description: Error rate is above 5 percent
---
# Prometheus RBAC
# Service account identity; the Prometheus Deployment in this file runs
# under it via serviceAccountName.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus
---
# Read-only cluster-wide permissions for the Prometheus service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core resources. nodes/proxy is needed because the kubernetes-nodes scrape
  # job fetches metrics via /api/v1/nodes/<name>/proxy/metrics.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingresses are served from networking.k8s.io (the group the Ingress in this
  # file uses). "extensions" is kept only for backward compatibility with
  # older clusters.
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the "prometheus" ClusterRole to the "prometheus" service account
# in the monitoring namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io
---
# Prometheus Deployment
# Single-replica Prometheus server with its TSDB on the prometheus-pvc claim.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  # The data volume is a ReadWriteOnce claim and the TSDB directory is locked
  # by the running server, so the old pod must stop before a new one starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # The prom/prometheus image runs as the unprivileged "nobody" user
      # (65534). fsGroup makes the persistent volume writable for that user.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        runAsGroup: 65534
        fsGroup: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v2.50.0
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=30d"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
          # Prometheus management endpoints for readiness and health checks.
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            periodSeconds: 10
            failureThreshold: 6
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
            failureThreshold: 6
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 200m
              memory: 512Mi
            limits:
              cpu: 1000m
              memory: 2Gi
      volumes:
        # One projected ConfigMap volume provides both files:
        #   /etc/prometheus/prometheus.yml
        #   /etc/prometheus/rules/alerts.yml
        # This avoids mounting a second volume inside the read-only ConfigMap
        # mount, where the runtime may be unable to create the nested
        # mount point.
        - name: config
          configMap:
            name: prometheus-config
            items:
              - key: prometheus.yml
                path: prometheus.yml
              - key: alerts.yml
                path: rules/alerts.yml
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-pvc
---
# Persistent storage claim mounted by the Prometheus Deployment at /prometheus.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-pvc
spec:
  resources:
    requests:
      storage: 20Gi
  accessModes: [ReadWriteOnce]
---
# In-cluster endpoint for the Prometheus server (pods labelled app=prometheus).
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus
spec:
  selector:
    app: prometheus
  type: ClusterIP
  ports:
    - targetPort: 9090
      port: 9090
---
# AlertManager ConfigMap
# Routes every alert to a single webhook receiver named "telegram".
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m

    route:
      group_by: ['alertname', 'severity']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: 'telegram'
      routes:
        # "matchers" replaces the deprecated "match" field.
        - matchers:
            - severity = "critical"
          receiver: 'telegram'
          continue: true

    receivers:
      - name: 'telegram'
        webhook_configs:
          - url: 'http://api-gateway.gooseek.svc:3015/api/v1/alerts/webhook'
            send_resolved: true

    inhibit_rules:
      # A firing critical alert mutes warning alerts with the same alertname.
      # "source_matchers"/"target_matchers" replace the deprecated
      # "source_match"/"target_match" fields.
      - source_matchers:
          - severity = "critical"
        target_matchers:
          - severity = "warning"
        equal: ['alertname']
---
# AlertManager Deployment
# Single-replica Alertmanager; its state lives in an emptyDir and is therefore
# lost whenever the pod is replaced.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - "--config.file=/etc/alertmanager/alertmanager.yml"
            - "--storage.path=/alertmanager"
          ports:
            - containerPort: 9093
          # Alertmanager management endpoints for readiness and health checks.
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 15
            periodSeconds: 15
          volumeMounts:
            - name: config
              mountPath: /etc/alertmanager
            - name: data
              mountPath: /alertmanager
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: alertmanager-config
        - name: data
          emptyDir: {}
---
# In-cluster endpoint for Alertmanager; prometheus.yml targets it as
# alertmanager:9093.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager
spec:
  selector:
    app: alertmanager
  type: ClusterIP
  ports:
    - targetPort: 9093
      port: 9093
---
# Grafana ConfigMap
# grafana.ini holds the server settings; datasources.yml provisions the
# Prometheus datasource.
#
# The admin password is intentionally NOT set here. The Grafana Deployment
# supplies it through the GF_SECURITY_ADMIN_PASSWORD environment variable
# (read from the grafana-secrets Secret), which overrides grafana.ini.
# Keeping a ${GRAFANA_ADMIN_PASSWORD} placeholder in this file would copy the
# rendered secret into a plain ConfigMap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  grafana.ini: |
    [server]
    root_url = https://grafana.gooseek.ru

    [security]
    admin_user = admin

    [auth.anonymous]
    enabled = false

    [dashboards]
    default_home_dashboard_path = /var/lib/grafana/dashboards/security.json

  datasources.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: false
---
# Grafana Deployment
# Single-replica Grafana with provisioning files from ConfigMaps and its data
# directory on the grafana-pvc claim.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  # The data volume is a ReadWriteOnce claim, so the old pod must release it
  # before the replacement pod can start.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      # The official Grafana image runs as UID 472. fsGroup makes the
      # persistent volume writable for that user.
      securityContext:
        fsGroup: 472
        supplementalGroups:
          - 0
      containers:
        - name: grafana
          image: grafana/grafana:10.3.3
          ports:
            - containerPort: 3000
          env:
            # Admin password comes from the grafana-secrets Secret.
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-secrets
                  key: admin-password
            - name: GF_INSTALL_PLUGINS
              value: "grafana-piechart-panel,grafana-clock-panel"
          # Readiness only: a liveness probe could restart the pod while
          # plugins are still being installed at startup. TODO confirm
          # typical startup time before adding one.
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            periodSeconds: 10
            failureThreshold: 6
          volumeMounts:
            - name: config
              mountPath: /etc/grafana/grafana.ini
              subPath: grafana.ini
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
            - name: dashboards-config
              mountPath: /etc/grafana/provisioning/dashboards
            - name: dashboards
              mountPath: /var/lib/grafana/dashboards
            - name: data
              mountPath: /var/lib/grafana
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
      volumes:
        - name: config
          configMap:
            name: grafana-config
        # Only datasources.yml is projected into the provisioning directory.
        - name: datasources
          configMap:
            name: grafana-config
            items:
              - key: datasources.yml
                path: datasources.yml
        - name: dashboards-config
          configMap:
            name: grafana-dashboards-config
        # NOTE(review): the grafana-dashboards ConfigMap is not defined in this
        # part of the file; confirm it is created elsewhere.
        - name: dashboards
          configMap:
            name: grafana-dashboards
        - name: data
          persistentVolumeClaim:
            claimName: grafana-pvc
---
# Persistent storage claim mounted by the Grafana Deployment at
# /var/lib/grafana.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana-pvc
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes: [ReadWriteOnce]
---
# Grafana admin credentials, read by the Grafana Deployment through
# secretKeyRef (key: admin-password).
# NOTE(review): the value is a ${...} placeholder; presumably the deploy
# tooling substitutes it before applying the manifest. Confirm.
kind: Secret
apiVersion: v1
type: Opaque
metadata:
  namespace: monitoring
  name: grafana-secrets
stringData:
  admin-password: "${GRAFANA_ADMIN_PASSWORD}"
---
# In-cluster endpoint for Grafana; the grafana-ingress backend points here.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana
spec:
  selector:
    app: grafana
  type: ClusterIP
  ports:
    - targetPort: 3000
      port: 3000
---
# Grafana Dashboards Config
# File-based dashboard provider: dashboards are read from
# /var/lib/grafana/dashboards.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana-dashboards-config
data:
  dashboards.yml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards
---
# Grafana Ingress
# Exposes the grafana Service at grafana.gooseek.ru over TLS; the
# cert-manager annotation names the "letsencrypt-prod" cluster issuer.
kind: Ingress
apiVersion: networking.k8s.io/v1
metadata:
  namespace: monitoring
  name: grafana-ingress
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  ingressClassName: nginx
  rules:
    - host: grafana.gooseek.ru
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: grafana
                port:
                  number: 3000
  tls:
    - secretName: grafana-tls
      hosts:
        - grafana.gooseek.ru
---
# Node Exporter DaemonSet (for host metrics)
# Runs one node-exporter pod per node on the host network. Pods carry
# prometheus.io/* annotations so the kubernetes-pods scrape job discovers them.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
    spec:
      hostNetwork: true
      hostPID: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.7.0
          args:
            - "--path.procfs=/host/proc"
            - "--path.sysfs=/host/sys"
            - "--path.rootfs=/host/root"
            # "$$" is the Kubernetes escape for a literal "$" in container
            # args.
            - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
          ports:
            - containerPort: 9100
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              # Lets mounts made on the host after container start become
              # visible under /host/root, as node_exporter recommends for
              # --path.rootfs.
              mountPropagation: HostToContainer
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 128Mi
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
---
# In-cluster endpoint for the node-exporter pods (app=node-exporter), with the
# same prometheus.io/* annotations as the pods.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: node-exporter
  annotations:
    prometheus.io/port: "9100"
    prometheus.io/scrape: "true"
spec:
  selector:
    app: node-exporter
  type: ClusterIP
  ports:
    - targetPort: 9100
      port: 9100