feat: LLM routing by tier (free→Ollama, pro→Timeweb)
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
Some checks failed
Build and Deploy GooSeek / build-and-deploy (push) Failing after 8m25s
- Add tier-based provider routing in llm-svc: free tier → Ollama (local qwen3.5:9b); pro/business → Timeweb Cloud AI
- Add /api/v1/embed endpoint for embeddings via Ollama
- Update Ollama client: qwen3.5:9b default, remove auth
- Add GenerateEmbedding() function for qwen3-embedding:0.6b
- Add Ollama K8s deployment with GPU support (RTX 4060 Ti)
- Add monitoring stack (Prometheus, Grafana, Alertmanager)
- Add Grafana dashboards for LLM and security metrics
- Update deploy.sh with monitoring and Ollama deployment

Made-with: Cursor
This commit is contained in:
@@ -32,7 +32,7 @@ func NewOllamaClient(cfg OllamaConfig) (*OllamaClient, error) {
|
||||
|
||||
modelKey := cfg.ModelKey
|
||||
if modelKey == "" {
|
||||
modelKey = "llama3.2"
|
||||
modelKey = "qwen3.5:9b"
|
||||
}
|
||||
|
||||
return &OllamaClient{
|
||||
@@ -231,3 +231,57 @@ func (c *OllamaClient) GenerateText(ctx context.Context, req StreamRequest) (str
|
||||
|
||||
return chatResp.Message.Content, nil
|
||||
}
|
||||
|
||||
// ollamaEmbedRequest is the JSON body for Ollama's /api/embed endpoint.
type ollamaEmbedRequest struct {
	Model string `json:"model"`
	Input string `json:"input"`
}

// ollamaEmbedResponse mirrors the JSON reply from /api/embed. Embeddings
// holds one vector per input; this client sends a single input and reads
// only the first vector.
type ollamaEmbedResponse struct {
	Model      string      `json:"model"`
	Embeddings [][]float64 `json:"embeddings"`
}

// embedHTTPClient is shared across calls so the transport can reuse
// TCP connections instead of allocating a new client per request.
var embedHTTPClient = &http.Client{Timeout: 30 * time.Second}

// GenerateEmbedding requests an embedding vector for input from the Ollama
// server at baseURL using the given model. An empty baseURL falls back to
// "http://ollama:11434" and an empty model to "qwen3-embedding:0.6b".
// It returns the first embedding from the response, or an error when the
// request fails, the server answers with a non-200 status, or the response
// contains no embedding data.
func GenerateEmbedding(baseURL, model, input string) ([]float64, error) {
	if baseURL == "" {
		baseURL = "http://ollama:11434"
	}
	if model == "" {
		model = "qwen3-embedding:0.6b"
	}

	body, err := json.Marshal(ollamaEmbedRequest{Model: model, Input: input})
	if err != nil {
		return nil, fmt.Errorf("marshal embed request: %w", err)
	}

	url := fmt.Sprintf("%s/api/embed", baseURL)
	resp, err := embedHTTPClient.Post(url, "application/json", bytes.NewReader(body))
	if err != nil {
		return nil, fmt.Errorf("embed request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: surface the server's message in the error; the read
		// error itself is intentionally ignored.
		respBody, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("ollama embed API: status %d, body: %s", resp.StatusCode, respBody)
	}

	var embedResp ollamaEmbedResponse
	if err := json.NewDecoder(resp.Body).Decode(&embedResp); err != nil {
		return nil, fmt.Errorf("decode embed response: %w", err)
	}

	if len(embedResp.Embeddings) == 0 || len(embedResp.Embeddings[0]) == 0 {
		return nil, errors.New("empty embeddings from Ollama")
	}

	return embedResp.Embeddings[0], nil
}
|
||||
|
||||
Reference in New Issue
Block a user