feat: LLM routing by tier (free→Ollama, pro→Timeweb)
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
- Add tier-based provider routing in llm-svc
  - free tier → Ollama (local qwen3.5:9b)
  - pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
This commit is contained in:
85
backend/deploy/k8s/ollama-models.yaml
Normal file
@@ -0,0 +1,85 @@
|
||||
# Job для загрузки моделей Ollama после деплоя
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: ollama-model-loader
|
||||
namespace: gooseek
|
||||
labels:
|
||||
app: ollama-model-loader
|
||||
spec:
|
||||
ttlSecondsAfterFinished: 3600
|
||||
backoffLimit: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ollama-model-loader
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
initContainers:
|
||||
- name: wait-for-ollama
|
||||
image: curlimages/curl:latest
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
echo "Waiting for Ollama to be ready..."
|
||||
until curl -sf http://ollama.gooseek.svc.cluster.local:11434/api/tags; do
|
||||
echo "Ollama not ready, retrying in 5s..."
|
||||
sleep 5
|
||||
done
|
||||
echo "Ollama is ready!"
|
||||
containers:
|
||||
- name: model-loader
|
||||
image: ollama/ollama:latest
|
||||
env:
|
||||
- name: OLLAMA_HOST
|
||||
value: "http://ollama.gooseek.svc.cluster.local:11434"
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
set -e
|
||||
|
||||
OLLAMA_URL="http://ollama.gooseek.svc.cluster.local:11434"
|
||||
|
||||
pull_model() {
|
||||
MODEL=$1
|
||||
echo "=== Pulling model: $MODEL ==="
|
||||
|
||||
# Check if model already exists
|
||||
EXISTING=$(curl -sf "$OLLAMA_URL/api/tags" | grep -o "\"name\":\"$MODEL\"" || true)
|
||||
if [ -n "$EXISTING" ]; then
|
||||
echo "Model $MODEL already exists, skipping..."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Pull model via API
|
||||
echo "Downloading $MODEL..."
|
||||
curl -sf "$OLLAMA_URL/api/pull" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"name\": \"$MODEL\", \"stream\": false}" \
|
||||
--max-time 1800
|
||||
|
||||
echo "Model $MODEL downloaded successfully!"
|
||||
}
|
||||
|
||||
echo "=== Ollama Model Loader ==="
|
||||
echo "Target: $OLLAMA_URL"
|
||||
|
||||
# Основная модель генерации (4 параллельных воркера)
|
||||
pull_model "qwen3.5:9b"
|
||||
|
||||
# Embedding модель (быстрые эмбеддинги)
|
||||
pull_model "qwen3-embedding:0.6b"
|
||||
|
||||
echo ""
|
||||
echo "=== All models loaded ==="
|
||||
curl -sf "$OLLAMA_URL/api/tags" | head -c 500
|
||||
echo ""
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
Reference in New Issue
Block a user