feat: LLM routing by tier (free→Ollama, pro→Timeweb)
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
- Add tier-based provider routing in llm-svc
  - free tier → Ollama (local qwen3.5:9b)
  - pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
This commit is contained in:
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/gooseek/backend/internal/llm"
|
||||
"github.com/gooseek/backend/internal/usage"
|
||||
"github.com/gooseek/backend/pkg/config"
|
||||
"github.com/gooseek/backend/pkg/metrics"
|
||||
"github.com/gooseek/backend/pkg/middleware"
|
||||
"github.com/gooseek/backend/pkg/ndjson"
|
||||
_ "github.com/lib/pq"
|
||||
@@ -34,6 +35,51 @@ type GenerateRequest struct {
|
||||
} `json:"options"`
|
||||
}
|
||||
|
||||
// EmbedRequest is the JSON payload accepted by POST /api/v1/embed.
// Input carries the text to embed; Model optionally overrides the
// configured default embedding model.
type EmbedRequest struct {
	Input string `json:"input"`
	Model string `json:"model,omitempty"`
}
|
||||
|
||||
// ProviderRouting is the outcome of a routing decision: which upstream
// LLM provider should serve a request and which model key to use with it.
// An empty ModelKey leaves model selection to the provider client.
type ProviderRouting struct {
	ProviderID string
	ModelKey   string
}
|
||||
|
||||
func resolveProvider(cfg *config.Config, tier string, requestedProvider string, requestedModel string) ProviderRouting {
|
||||
if tier == "free" || tier == "" {
|
||||
return ProviderRouting{
|
||||
ProviderID: "ollama",
|
||||
ModelKey: cfg.OllamaModelKey,
|
||||
}
|
||||
}
|
||||
|
||||
if requestedProvider != "" && requestedProvider != "auto" {
|
||||
return ProviderRouting{
|
||||
ProviderID: requestedProvider,
|
||||
ModelKey: requestedModel,
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.TimewebAgentAccessID != "" && cfg.TimewebAPIKey != "" {
|
||||
return ProviderRouting{
|
||||
ProviderID: "timeweb",
|
||||
ModelKey: requestedModel,
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.OpenAIAPIKey != "" {
|
||||
return ProviderRouting{
|
||||
ProviderID: "openai",
|
||||
ModelKey: "gpt-4o-mini",
|
||||
}
|
||||
}
|
||||
|
||||
return ProviderRouting{
|
||||
ProviderID: "ollama",
|
||||
ModelKey: cfg.OllamaModelKey,
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
cfg, err := config.Load()
|
||||
if err != nil {
|
||||
@@ -70,19 +116,46 @@ func main() {
|
||||
|
||||
app.Use(logger.New())
|
||||
app.Use(cors.New())
|
||||
app.Use(metrics.PrometheusMiddleware(metrics.MetricsConfig{
|
||||
ServiceName: "llm-svc",
|
||||
}))
|
||||
|
||||
app.Get("/health", func(c *fiber.Ctx) error {
|
||||
return c.JSON(fiber.Map{"status": "ok"})
|
||||
})
|
||||
|
||||
app.Get("/ready", func(c *fiber.Ctx) error {
|
||||
return c.JSON(fiber.Map{"status": "ready"})
|
||||
})
|
||||
|
||||
app.Get("/metrics", metrics.MetricsHandler())
|
||||
|
||||
app.Get("/api/v1/providers", func(c *fiber.Ctx) error {
|
||||
providers := []fiber.Map{}
|
||||
|
||||
providers = append(providers, fiber.Map{
|
||||
"id": "ollama",
|
||||
"name": "GooSeek AI (Бесплатно)",
|
||||
"models": []string{cfg.OllamaModelKey},
|
||||
"tier": "free",
|
||||
"isLocal": true,
|
||||
})
|
||||
|
||||
if cfg.TimewebAgentAccessID != "" && cfg.TimewebAPIKey != "" {
|
||||
providers = append(providers, fiber.Map{
|
||||
"id": "timeweb",
|
||||
"name": "Timeweb Cloud AI (Pro)",
|
||||
"models": []string{"gpt-4o", "gpt-4o-mini", "claude-3-5-sonnet", "gemini-1.5-pro"},
|
||||
"tier": "pro",
|
||||
})
|
||||
}
|
||||
|
||||
if cfg.OpenAIAPIKey != "" {
|
||||
providers = append(providers, fiber.Map{
|
||||
"id": "openai",
|
||||
"name": "OpenAI",
|
||||
"models": []string{"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"},
|
||||
"models": []string{"gpt-4o", "gpt-4o-mini", "gpt-4-turbo"},
|
||||
"tier": "pro",
|
||||
})
|
||||
}
|
||||
|
||||
@@ -90,7 +163,8 @@ func main() {
|
||||
providers = append(providers, fiber.Map{
|
||||
"id": "anthropic",
|
||||
"name": "Anthropic",
|
||||
"models": []string{"claude-3-5-sonnet-20241022", "claude-3-opus-20240229", "claude-3-haiku-20240307"},
|
||||
"models": []string{"claude-3-5-sonnet-20241022", "claude-3-opus-20240229"},
|
||||
"tier": "pro",
|
||||
})
|
||||
}
|
||||
|
||||
@@ -99,6 +173,7 @@ func main() {
|
||||
"id": "gemini",
|
||||
"name": "Google Gemini",
|
||||
"models": []string{"gemini-1.5-pro", "gemini-1.5-flash", "gemini-2.0-flash-exp"},
|
||||
"tier": "pro",
|
||||
})
|
||||
}
|
||||
|
||||
@@ -123,31 +198,49 @@ func main() {
|
||||
}))
|
||||
|
||||
llmAPI.Post("/generate", func(c *fiber.Ctx) error {
|
||||
startTime := time.Now()
|
||||
userID := middleware.GetUserID(c)
|
||||
tier := middleware.GetUserTier(c)
|
||||
clientIP := c.IP()
|
||||
|
||||
if tier == "" {
|
||||
tier = "free"
|
||||
}
|
||||
|
||||
var req GenerateRequest
|
||||
if err := c.BodyParser(&req); err != nil {
|
||||
metrics.RecordLLMError(req.ProviderID, "invalid_request")
|
||||
return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
if len(req.Messages) == 0 {
|
||||
metrics.RecordLLMError(req.ProviderID, "empty_messages")
|
||||
return c.Status(400).JSON(fiber.Map{"error": "Messages required"})
|
||||
}
|
||||
|
||||
limits := usage.GetLimits(tier)
|
||||
if req.Options.MaxTokens == 0 || req.Options.MaxTokens > limits.MaxTokensPerReq {
|
||||
if tier == "free" && req.Options.MaxTokens > limits.MaxTokensPerReq {
|
||||
metrics.RecordFreeTierLimitExceeded(userID, "max_tokens")
|
||||
}
|
||||
req.Options.MaxTokens = limits.MaxTokensPerReq
|
||||
}
|
||||
|
||||
routing := resolveProvider(cfg, tier, req.ProviderID, req.ModelKey)
|
||||
providerID := routing.ProviderID
|
||||
modelKey := routing.ModelKey
|
||||
|
||||
metrics.RecordLLMRequest(providerID, modelKey, tier, userID)
|
||||
|
||||
client, err := llm.NewClient(llm.ProviderConfig{
|
||||
ProviderID: req.ProviderID,
|
||||
ModelKey: req.ModelKey,
|
||||
APIKey: getAPIKey(cfg, req.ProviderID),
|
||||
ProviderID: providerID,
|
||||
ModelKey: modelKey,
|
||||
APIKey: getAPIKey(cfg, providerID),
|
||||
BaseURL: getBaseURL(cfg, providerID),
|
||||
AgentAccessID: cfg.TimewebAgentAccessID,
|
||||
})
|
||||
if err != nil {
|
||||
metrics.RecordLLMError(req.ProviderID, "client_init_error")
|
||||
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
|
||||
}
|
||||
|
||||
@@ -171,6 +264,8 @@ func main() {
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
metrics.RecordLLMError(providerID, "stream_error")
|
||||
metrics.RecordSecurityEvent("llm_error", clientIP, userID)
|
||||
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
|
||||
}
|
||||
|
||||
@@ -179,14 +274,23 @@ func main() {
|
||||
|
||||
c.Context().SetBodyStreamWriter(func(w *bufio.Writer) {
|
||||
writer := ndjson.NewWriter(w)
|
||||
tokenCount := 0
|
||||
for chunk := range stream {
|
||||
writer.Write(fiber.Map{
|
||||
"type": "chunk",
|
||||
"chunk": chunk.ContentChunk,
|
||||
})
|
||||
w.Flush()
|
||||
tokenCount += len(chunk.ContentChunk) / 4
|
||||
}
|
||||
writer.Write(fiber.Map{"type": "done"})
|
||||
|
||||
metrics.RecordLLMLatency(providerID, modelKey, time.Since(startTime))
|
||||
metrics.RecordLLMTokens(providerID, tier, userID, tokenCount)
|
||||
|
||||
if usageRepo != nil {
|
||||
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, tokenCount)
|
||||
}
|
||||
})
|
||||
|
||||
return nil
|
||||
@@ -200,11 +304,16 @@ func main() {
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
metrics.RecordLLMError(providerID, "generate_error")
|
||||
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
|
||||
}
|
||||
|
||||
tokenCount := len(response) / 4
|
||||
metrics.RecordLLMLatency(providerID, modelKey, time.Since(startTime))
|
||||
metrics.RecordLLMTokens(providerID, tier, userID, tokenCount)
|
||||
|
||||
if usageRepo != nil {
|
||||
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, len(response)/4)
|
||||
go usageRepo.IncrementLLMUsage(context.Background(), userID, tier, tokenCount)
|
||||
}
|
||||
|
||||
return c.JSON(fiber.Map{
|
||||
@@ -213,7 +322,39 @@ func main() {
|
||||
})
|
||||
|
||||
llmAPI.Post("/embed", func(c *fiber.Ctx) error {
|
||||
return c.Status(501).JSON(fiber.Map{"error": "Not implemented"})
|
||||
userID := middleware.GetUserID(c)
|
||||
tier := middleware.GetUserTier(c)
|
||||
|
||||
if tier == "" {
|
||||
tier = "free"
|
||||
}
|
||||
|
||||
var req EmbedRequest
|
||||
if err := c.BodyParser(&req); err != nil {
|
||||
return c.Status(400).JSON(fiber.Map{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
if req.Input == "" {
|
||||
return c.Status(400).JSON(fiber.Map{"error": "Input text required"})
|
||||
}
|
||||
|
||||
model := req.Model
|
||||
if model == "" {
|
||||
model = cfg.OllamaEmbeddingModel
|
||||
}
|
||||
|
||||
embeddings, err := llm.GenerateEmbedding(cfg.OllamaBaseURL, model, req.Input)
|
||||
if err != nil {
|
||||
metrics.RecordLLMError("ollama", "embed_error")
|
||||
return c.Status(500).JSON(fiber.Map{"error": err.Error()})
|
||||
}
|
||||
|
||||
metrics.RecordLLMRequest("ollama", model, tier, userID)
|
||||
|
||||
return c.JSON(fiber.Map{
|
||||
"embedding": embeddings,
|
||||
"model": model,
|
||||
})
|
||||
})
|
||||
|
||||
port := cfg.LLMSvcPort
|
||||
@@ -223,8 +364,10 @@ func main() {
|
||||
|
||||
func getAPIKey(cfg *config.Config, providerID string) string {
|
||||
switch providerID {
|
||||
case "openai", "timeweb":
|
||||
case "openai":
|
||||
return cfg.OpenAIAPIKey
|
||||
case "timeweb":
|
||||
return cfg.TimewebAPIKey
|
||||
case "anthropic":
|
||||
return cfg.AnthropicAPIKey
|
||||
case "gemini", "google":
|
||||
@@ -234,6 +377,17 @@ func getAPIKey(cfg *config.Config, providerID string) string {
|
||||
}
|
||||
}
|
||||
|
||||
func getBaseURL(cfg *config.Config, providerID string) string {
|
||||
switch providerID {
|
||||
case "timeweb":
|
||||
return cfg.TimewebAPIBaseURL
|
||||
case "ollama":
|
||||
return cfg.OllamaBaseURL
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func init() {
|
||||
if os.Getenv("PORT") == "" {
|
||||
os.Setenv("PORT", "3020")
|
||||
|
||||
Reference in New Issue
Block a user