feat: LLM routing by tier (free→Ollama, pro→Timeweb)

- Add tier-based provider routing in llm-svc
  - free tier → Ollama (local qwen3.5:9b)
  - pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
2026-03-03 02:25:22 +03:00
parent 5ac082a7c6
commit 7a40ff629e
19 changed files with 1759 additions and 35 deletions
--- a/backend/pkg/middleware/llm_limits.go
+++ b/backend/pkg/middleware/llm_limits.go
@@ -6,6 +6,7 @@ import (

 	"github.com/gofiber/fiber/v2"
 	"github.com/gooseek/backend/internal/usage"
+	"github.com/gooseek/backend/pkg/metrics"
 )

 type LLMLimitsConfig struct {
@@ -15,7 +16,11 @@ type LLMLimitsConfig struct {
 func LLMLimits(config LLMLimitsConfig) fiber.Handler {
 	return func(c *fiber.Ctx) error {
 		userID := GetUserID(c)
+		clientIP := c.IP()
+
 		if userID == "" {
+			metrics.RecordLLMUnauthorized("no_user_id", clientIP)
+			metrics.RecordSecurityEvent("unauthorized_llm_access", clientIP, "anonymous")
 			return c.Status(401).JSON(fiber.Map{
 				"error": "Authentication required",
 			})
@@ -30,6 +35,13 @@ func LLMLimits(config LLMLimitsConfig) fiber.Handler {
 			allowed, reason := config.UsageRepo.CheckLLMLimits(c.Context(), userID, tier)
 			if !allowed {
 				limits := usage.GetLimits(tier)
+
+				if tier == "free" {
+					metrics.RecordFreeTierLimitExceeded(userID, reason)
+					metrics.RecordSecurityEvent("free_tier_limit_exceeded", clientIP, userID)
+				}
+				metrics.RecordRateLimitHit("llm-svc", clientIP, reason)
+
 				return c.Status(429).JSON(fiber.Map{
 					"error":        reason,
 					"tier":         tier,