feat: LLM routing by tier (free→Ollama, pro→Timeweb)
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s

- Add tier-based provider routing in llm-svc
  - free tier → Ollama (local qwen3.5:9b)
  - pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
This commit is contained in:
home
2026-03-03 02:25:22 +03:00
parent 5ac082a7c6
commit 7a40ff629e
19 changed files with 1759 additions and 35 deletions

View File

@@ -15,6 +15,7 @@ import (
"github.com/gooseek/backend/internal/llm"
"github.com/gooseek/backend/internal/usage"
"github.com/gooseek/backend/pkg/config"
"github.com/gooseek/backend/pkg/metrics"
"github.com/gooseek/backend/pkg/middleware"
"github.com/gooseek/backend/pkg/ndjson"
_ "github.com/lib/pq"
@@ -34,6 +35,51 @@ type GenerateRequest struct {
} `json:"options"`
}
// EmbedRequest is the JSON body accepted by POST /api/v1/embed.
type EmbedRequest struct {
	// Input is the text to embed; required (handler rejects empty input).
	Input string `json:"input"`
	// Model optionally overrides the embedding model; when empty the
	// handler falls back to cfg.OllamaEmbeddingModel.
	Model string `json:"model,omitempty"`
}
// ProviderRouting is the result of tier-based provider selection:
// which backend serves the request and which model key to use.
type ProviderRouting struct {
	// ProviderID identifies the backend ("ollama", "timeweb", "openai", ...).
	ProviderID string
	// ModelKey is the model to request from that backend; may be empty
	// when the caller did not pin a model (backend default applies).
	ModelKey string
}
// resolveProvider selects the LLM backend and model for a request.
//
// Routing policy, in priority order:
//  1. free (or unknown) tier is always served by the local Ollama instance;
//  2. paying tiers may pin an explicit provider in the request ("auto" and
//     "" mean no preference);
//  3. otherwise Timeweb is preferred when its credentials are configured,
//     then OpenAI, and finally Ollama as the last resort.
func resolveProvider(cfg *config.Config, tier string, requestedProvider string, requestedModel string) ProviderRouting {
	// Default/fallback target shared by rules 1 and the final fallback.
	ollama := ProviderRouting{ProviderID: "ollama", ModelKey: cfg.OllamaModelKey}

	if tier == "" || tier == "free" {
		return ollama
	}

	switch {
	case requestedProvider != "" && requestedProvider != "auto":
		// Explicit client choice wins for paying tiers.
		return ProviderRouting{ProviderID: requestedProvider, ModelKey: requestedModel}
	case cfg.TimewebAgentAccessID != "" && cfg.TimewebAPIKey != "":
		return ProviderRouting{ProviderID: "timeweb", ModelKey: requestedModel}
	case cfg.OpenAIAPIKey != "":
		return ProviderRouting{ProviderID: "openai", ModelKey: "gpt-4o-mini"}
	default:
		return ollama
	}
}
func main() {
cfg, err := config.Load()
if err != nil {
@@ -70,19 +116,46 @@ func main() {
app.Use(logger.New())
app.Use(cors.New())
app.Use(metrics.PrometheusMiddleware(metrics.MetricsConfig{
ServiceName: "llm-svc",
}))
app.Get("/health", func(c *fiber.Ctx) error {
return c.JSON(fiber.Map{"status": "ok"})
})
app.Get("/ready", func(c *fiber.Ctx) error {
return c.JSON(fiber.Map{"status": "ready"})
})
app.Get("/metrics", metrics.MetricsHandler())
app.Get("/api/v1/providers", func(c *fiber.Ctx) error {
providers := []fiber.Map{}
providers = append(providers, fiber.Map{
"id": "ollama",
"name": "GooSeek AI (Бесплатно)",
"models": []string{cfg.OllamaModelKey},
"tier": "free",
"isLocal": true,
})
if cfg.TimewebAgentAccessID != "" && cfg.TimewebAPIKey != "" {
providers = append(providers, fiber.Map{
"id": "timeweb",
"name": "Timeweb Cloud AI (Pro)",
"models": []string{"gpt-4o", "gpt-4o-mini", "claude-3-5-sonnet", "gemini-1.5-pro"},
"tier": "pro",
})
}
if cfg.OpenAIAPIKey != "" {
providers = append(providers, fiber.Map{
"id": "openai",
"name": "OpenAI",
"models": []string{"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"},
"models": []string{"gpt-4o", "gpt-4o-mini", "gpt-4-turbo"},
"tier": "pro",
})
}
@@ -90,7 +163,8 @@ func main() {
providers = append(providers, fiber.Map{
"id": "anthropic",
"name": "Anthropic",
"models": []string{"claude-3-5-sonnet-20241022", "claude-3-opus-20240229", "claude-3-haiku-20240307"},
"models": []string{"claude-3-5-sonnet-20241022", "claude-3-opus-20240229"},
"tier": "pro",
})
}
@@ -99,6 +173,7 @@ func main() {
"id": "gemini",
"name": "Google Gemini",
"models": []string{"gemini-1.5-pro", "gemini-1.5-flash", "gemini-2.0-flash-exp"},
"tier": "pro",
})
}
@@ -123,31 +198,49 @@ func main() {
}))
llmAPI.Post("/generate", func(c *fiber.Ctx) error {
startTime := time.Now()
userID := middleware.GetUserID(c)
tier := middleware.GetUserTier(c)
clientIP := c.IP()
if tier == "" {
tier = "free"
}
var req GenerateRequest
if err := c.BodyParser(&req); err != nil {
metrics.RecordLLMError(req.ProviderID, "invalid_request")
return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
}
if len(req.Messages) == 0 {
metrics.RecordLLMError(req.ProviderID, "empty_messages")
return c.Status(400).JSON(fiber.Map{"error": "Messages required"})
}
limits := usage.GetLimits(tier)
if req.Options.MaxTokens == 0 || req.Options.MaxTokens > limits.MaxTokensPerReq {
if tier == "free" && req.Options.MaxTokens > limits.MaxTokensPerReq {
metrics.RecordFreeTierLimitExceeded(userID, "max_tokens")
}
req.Options.MaxTokens = limits.MaxTokensPerReq
}
routing := resolveProvider(cfg, tier, req.ProviderID, req.ModelKey)
providerID := routing.ProviderID
modelKey := routing.ModelKey
metrics.RecordLLMRequest(providerID, modelKey, tier, userID)
client, err := llm.NewClient(llm.ProviderConfig{
ProviderID: req.ProviderID,
ModelKey: req.ModelKey,
APIKey: getAPIKey(cfg, req.ProviderID),
ProviderID: providerID,
ModelKey: modelKey,
APIKey: getAPIKey(cfg, providerID),
BaseURL: getBaseURL(cfg, providerID),
AgentAccessID: cfg.TimewebAgentAccessID,
})
if err != nil {
metrics.RecordLLMError(req.ProviderID, "client_init_error")
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
}
@@ -171,6 +264,8 @@ func main() {
},
})
if err != nil {
metrics.RecordLLMError(providerID, "stream_error")
metrics.RecordSecurityEvent("llm_error", clientIP, userID)
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
}
@@ -179,14 +274,23 @@ func main() {
c.Context().SetBodyStreamWriter(func(w *bufio.Writer) {
writer := ndjson.NewWriter(w)
tokenCount := 0
for chunk := range stream {
writer.Write(fiber.Map{
"type": "chunk",
"chunk": chunk.ContentChunk,
})
w.Flush()
tokenCount += len(chunk.ContentChunk) / 4
}
writer.Write(fiber.Map{"type": "done"})
metrics.RecordLLMLatency(providerID, modelKey, time.Since(startTime))
metrics.RecordLLMTokens(providerID, tier, userID, tokenCount)
if usageRepo != nil {
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, tokenCount)
}
})
return nil
@@ -200,11 +304,16 @@ func main() {
},
})
if err != nil {
metrics.RecordLLMError(providerID, "generate_error")
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
}
tokenCount := len(response) / 4
metrics.RecordLLMLatency(providerID, modelKey, time.Since(startTime))
metrics.RecordLLMTokens(providerID, tier, userID, tokenCount)
if usageRepo != nil {
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, len(response)/4)
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, tokenCount)
}
return c.JSON(fiber.Map{
@@ -213,7 +322,39 @@ func main() {
})
llmAPI.Post("/embed", func(c *fiber.Ctx) error {
return c.Status(501).JSON(fiber.Map{"error": "Not implemented"})
userID := middleware.GetUserID(c)
tier := middleware.GetUserTier(c)
if tier == "" {
tier = "free"
}
var req EmbedRequest
if err := c.BodyParser(&req); err != nil {
return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
}
if req.Input == "" {
return c.Status(400).JSON(fiber.Map{"error": "Input text required"})
}
model := req.Model
if model == "" {
model = cfg.OllamaEmbeddingModel
}
embeddings, err := llm.GenerateEmbedding(cfg.OllamaBaseURL, model, req.Input)
if err != nil {
metrics.RecordLLMError("ollama", "embed_error")
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
}
metrics.RecordLLMRequest("ollama", model, tier, userID)
return c.JSON(fiber.Map{
"embedding": embeddings,
"model": model,
})
})
port := cfg.LLMSvcPort
@@ -223,8 +364,10 @@ func main() {
func getAPIKey(cfg *config.Config, providerID string) string {
switch providerID {
case "openai", "timeweb":
case "openai":
return cfg.OpenAIAPIKey
case "timeweb":
return cfg.TimewebAPIKey
case "anthropic":
return cfg.AnthropicAPIKey
case "gemini", "google":
@@ -234,6 +377,17 @@ func getAPIKey(cfg *config.Config, providerID string) string {
}
}
// getBaseURL returns the custom API base URL for providers that need one
// (Timeweb and the local Ollama instance). Providers that use their SDK's
// default endpoint get "".
func getBaseURL(cfg *config.Config, providerID string) string {
	if providerID == "timeweb" {
		return cfg.TimewebAPIBaseURL
	}
	if providerID == "ollama" {
		return cfg.OllamaBaseURL
	}
	return ""
}
func init() {
if os.Getenv("PORT") == "" {
os.Setenv("PORT", "3020")