feat: Go backend, enhanced search, new widgets, Docker deploy

Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
2026-02-27 04:15:32 +03:00
parent 328d968f3f
commit 06fe57c765
285 changed files with 53132 additions and 1871 deletions
--- a/services/cache-worker/Dockerfile
+++ b/services/cache-worker/Dockerfile
@@ -1,7 +1,9 @@
+# syntax=docker/dockerfile:1
 FROM node:22-alpine AS builder
 WORKDIR /app
 COPY package*.json ./
-RUN npm ci
+COPY --from=npm-cache / /tmp/npm-cache
+RUN npm install --cache /tmp/npm-cache --prefer-offline --no-audit
 COPY tsconfig.json ./
 COPY src ./src
 RUN npm run build
@@ -9,7 +11,8 @@ RUN npm run build
 FROM node:22-alpine
 WORKDIR /app
 COPY package*.json ./
-RUN npm ci --omit=dev
+COPY --from=npm-cache / /tmp/npm-cache
+RUN npm install --omit=dev --cache /tmp/npm-cache --prefer-offline --no-audit
 COPY --from=builder /app/dist ./dist
 EXPOSE 9090
 ENTRYPOINT ["node", "dist/run.js"]
--- a/services/cache-worker/package.json
+++ b/services/cache-worker/package.json
@@ -11,7 +11,9 @@
    "build": "tsc"
  },
  "dependencies": {
-    "ioredis": "^5.4.1"
+    "https-proxy-agent": "^7.0.6",
+    "ioredis": "^5.4.1",
+    "socks-proxy-agent": "^8.0.5"
  },
  "devDependencies": {
    "@types/node": "^22.10.0",
--- a/services/cache-worker/src/lib/crawl4ai-client.ts
+++ b/services/cache-worker/src/lib/crawl4ai-client.ts
@@ -0,0 +1,154 @@
+/**
+ * Crawl4AI REST API client for cache-worker with stealth support
+ */
+
+/** Outcome of crawling a single URL, via Crawl4AI or the plain-fetch fallback. */
+export interface CrawlResult {
+  /** The URL that was requested. */
+  url: string;
+  /** Page title; falls back to the URL, or an `Error: …` label when the fetch failed. */
+  title: string;
+  /** Extracted markdown or tag-stripped text; holds the failure description when the fetch failed. */
+  content: string;
+  /** Whether the crawl (or the fallback HTTP request) succeeded. */
+  success: boolean;
+  /** HTTP status code when one was reported. */
+  statusCode?: number;
+}
+
+// Crawl4AI base URL with one trailing slash removed; empty string when CRAWL4AI_URL is unset.
+const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? '').replace(/\/$/, '');
+// Abort timeout (ms) for the batched Crawl4AI request.
+const TIMEOUT_MS = 45_000;
+// Upper bound on how many URLs one crawlUrls() call processes.
+const MAX_URLS_PER_BATCH = 15;
+
+// Desktop browser User-Agent strings; one is picked at random per request.
+const USER_AGENTS = [
+  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
+];
+
+/** Picks one entry of USER_AGENTS at random. */
+function randomUserAgent(): string {
+  const pick = Math.floor(Math.random() * USER_AGENTS.length);
+  return USER_AGENTS[pick];
+}
+
+/**
+ * Normalizes the `markdown` field of a Crawl4AI result to a plain string.
+ * A string is returned as-is; for an object the first usable field wins, in
+ * the order `raw_markdown`, `markdown_with_citations`, non-empty `fit_markdown`.
+ * Anything else yields an empty string.
+ */
+function extractCrawl4aiMarkdown(md: unknown): string {
+  if (typeof md === 'string') return md;
+  if (!md || typeof md !== 'object') return '';
+
+  const {
+    raw_markdown: raw,
+    markdown_with_citations: cited,
+    fit_markdown: fit,
+  } = md as Record<string, unknown>;
+
+  if (typeof raw === 'string') return raw;
+  if (typeof cited === 'string') return cited;
+  return typeof fit === 'string' && fit.length > 0 ? fit : '';
+}
+
+/**
+ * Reduces an HTML fragment to plain text: removes <script> and <style> blocks,
+ * replaces every remaining tag with a space, collapses whitespace runs and
+ * caps the result at 50,000 characters.
+ */
+function stripHtmlToText(html: string): string {
+  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, '');
+  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, '');
+  const withoutTags = withoutStyles.replace(/<[^>]+>/g, ' ');
+  const collapsed = withoutTags.replace(/\s+/g, ' ').trim();
+  return collapsed.slice(0, 50_000);
+}
+
+/**
+ * Plain HTTP fallback for a single URL.
+ * Fetches the page with browser-like headers and a 15 s timeout, uses the
+ * <title> text (or the URL) as title and the tag-stripped <body> (or the whole
+ * document when no body is found) as content. Never rejects: any thrown error
+ * is reported as an unsuccessful CrawlResult.
+ */
+async function fetchFallback(url: string): Promise<CrawlResult> {
+  try {
+    const requestHeaders = {
+      'User-Agent': randomUserAgent(),
+      Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+      'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
+    };
+    const response = await fetch(url, {
+      signal: AbortSignal.timeout(15000),
+      headers: requestHeaders,
+    });
+    const page = await response.text();
+
+    const pageTitle = page.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
+    const bodyHtml = page.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[1] ?? page;
+    const text = stripHtmlToText(bodyHtml);
+
+    return {
+      url,
+      title: pageTitle,
+      // An empty body still yields non-empty content.
+      content: text || pageTitle,
+      success: response.ok,
+      statusCode: response.status,
+    };
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    return {
+      url,
+      title: `Error: ${url}`,
+      content: `Failed to fetch: ${reason}`,
+      success: false,
+    };
+  }
+}
+
+/**
+ * Crawl multiple URLs via Crawl4AI with stealth settings.
+ *
+ * Only the first MAX_URLS_PER_BATCH URLs are processed. When CRAWL4AI_URL is
+ * not configured, or the batched Crawl4AI request fails, every URL is fetched
+ * through fetchFallback instead. A URL whose Crawl4AI result is missing,
+ * unsuccessful or has 100 characters of markdown or fewer is also re-fetched
+ * through the fallback.
+ *
+ * @param urls URLs to crawl
+ * @returns one CrawlResult per processed URL, in input order
+ */
+export async function crawlUrls(urls: string[]): Promise<CrawlResult[]> {
+  const list = urls.slice(0, MAX_URLS_PER_BATCH);
+  if (list.length === 0) return [];
+
+  if (!CRAWL4AI_URL) {
+    return Promise.all(list.map(fetchFallback));
+  }
+
+  try {
+    const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        urls: list,
+        crawler_config: {
+          type: 'CrawlerRunConfig',
+          params: {
+            cache_mode: 'bypass',
+            page_timeout: 30000,
+            user_agent: randomUserAgent(),
+            simulate_user: true,
+            override_navigator: true,
+          },
+        },
+      }),
+      signal: AbortSignal.timeout(TIMEOUT_MS),
+    });
+
+    if (!res.ok) {
+      throw new Error(`Crawl4AI HTTP ${res.status}`);
+    }
+
+    const data = (await res.json()) as {
+      results?: Array<{
+        url?: string;
+        success?: boolean;
+        markdown?: unknown;
+        metadata?: { title?: string };
+        status_code?: number;
+      }>;
+    };
+
+    // Index results by URL; the first result reported for a URL wins.
+    const rawResults = data.results ?? [];
+    const byUrl = new Map<string, (typeof rawResults)[0]>();
+    for (const r of rawResults) {
+      const u = r.url ?? '';
+      if (u && !byUrl.has(u)) byUrl.set(u, r);
+    }
+
+    // Fallback fetches run concurrently instead of one after another;
+    // Promise.all keeps the results in input order.
+    return await Promise.all(
+      list.map((url): CrawlResult | Promise<CrawlResult> => {
+        const r = byUrl.get(url);
+        const markdown = extractCrawl4aiMarkdown(r?.markdown);
+        if (r?.success && markdown.length > 100) {
+          return {
+            url,
+            title: r.metadata?.title ?? url,
+            content: markdown,
+            success: true,
+            statusCode: r.status_code,
+          };
+        }
+        return fetchFallback(url);
+      }),
+    );
+  } catch {
+    // Crawl4AI unreachable or returned an error: fall back for the whole batch.
+    return Promise.all(list.map(fetchFallback));
+  }
+}
--- a/services/cache-worker/src/run.ts
+++ b/services/cache-worker/src/run.ts
@@ -1,12 +1,15 @@
 /**
- * cache-worker — pre-compute discover, finance, travel
+ * cache-worker — pre-compute discover, finance, travel, digest-queue
 * docs/architecture: 03-cache-and-precompute-strategy.md
- * Cron: discover every 15m, finance every 2m, travel every 4h
- * Usage: node run.js --task=discover|finance|travel|all
+ * Cron: discover every 15m, finance every 2m, travel every 4h, digest-queue continuous
+ * Usage: node run.js --task=discover|finance|travel|digest-queue|all [--topic=tech|finance|art|sports|entertainment]
 */

 import Redis from 'ioredis';
-import { runDiscoverPrecompute } from './tasks/discover.js';
+import {
+  runDiscoverPrecompute,
+  type DiscoverTopic,
+} from './tasks/discover.js';
 import { runFinancePrecompute } from './tasks/finance.js';
 import { runTravelPrecompute } from './tasks/travel.js';

@@ -14,20 +17,165 @@ const REDIS_URL = process.env.REDIS_URL ?? 'redis://localhost:6379';
 const DISCOVER_SVC_URL = process.env.DISCOVER_SVC_URL ?? 'http://localhost:3002';
 const FINANCE_SVC_URL = process.env.FINANCE_SVC_URL ?? 'http://localhost:3003';
 const TRAVEL_SVC_URL = process.env.TRAVEL_SVC_URL ?? 'http://localhost:3004';
+const MASTER_AGENTS_SVC_URL = process.env.MASTER_AGENTS_SVC_URL ?? 'http://localhost:3006';
+const DIGEST_QUEUE_BATCH_SIZE = 5;
+const DIGEST_QUEUE_POLL_INTERVAL_MS = 5000;

 function getTask(): string {
  const idx = process.argv.indexOf('--task');
  return idx >= 0 && process.argv[idx + 1] ? process.argv[idx + 1] : 'all';
 }

+/**
+ * Reads the optional discover topic from the command line.
+ *
+ * Accepts both `--topic <value>` and the `--topic=<value>` form shown in the
+ * usage text. When both are present the `=` form wins.
+ *
+ * @returns the topic when it is one of the supported values, otherwise undefined
+ */
+function getDiscoverTopic(): DiscoverTopic | undefined {
+  const FLAG = '--topic';
+  const allowed = ['tech', 'finance', 'art', 'sports', 'entertainment'];
+
+  let value: string | undefined;
+  const inline = process.argv.find((arg) => arg.startsWith(`${FLAG}=`));
+  if (inline !== undefined) {
+    value = inline.slice(FLAG.length + 1);
+  } else {
+    const idx = process.argv.indexOf(FLAG);
+    value = idx >= 0 ? process.argv[idx + 1] : undefined;
+  }
+
+  return value && allowed.includes(value) ? (value as DiscoverTopic) : undefined;
+}
+
+/**
+ * Generates a summary for one queued article and stores it in discover-svc.
+ *
+ * Sends a search request for the article URL to master-agents-svc, collects
+ * the non-empty lines of the streamed response as events, then posts them to
+ * the discover-svc article-summary endpoint.
+ *
+ * @param discoverBase discover-svc base URL without a trailing slash
+ * @param item queue entry to process
+ * @returns true only when events were received and discover-svc accepted
+ *          them; every failure is logged and reported as false (never throws)
+ */
+async function processDigestQueueItem(
+  discoverBase: string,
+  item: { url: string; title: string },
+): Promise<boolean> {
+  console.log(`[digest-queue] Processing: ${item.url.slice(0, 60)}`);
+
+  try {
+    const res = await fetch(`${MASTER_AGENTS_SVC_URL.replace(/\/$/, '')}/api/v1/agents/search`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        message: {
+          messageId: crypto.randomUUID(),
+          chatId: 'digest-queue',
+          content: `Summary: ${item.url}`,
+        },
+        optimizationMode: 'balanced',
+        sources: ['web'],
+        history: [],
+        files: [],
+        chatModel: { providerId: 'env', key: 'default' },
+        systemInstructions: '',
+        locale: 'ru',
+        answerMode: 'standard',
+      }),
+      signal: AbortSignal.timeout(180000),
+      duplex: 'half',
+    } as RequestInit);
+
+    if (!res.ok) {
+      console.warn(`[digest-queue] Failed to generate summary for ${item.url}: HTTP ${res.status}`);
+      return false;
+    }
+
+    const events: string[] = [];
+    const reader = res.body?.getReader();
+    if (!reader) return false;
+
+    const decoder = new TextDecoder();
+    let buffer = '';
+
+    // Split the stream into lines, carrying the trailing partial line over
+    // to the next chunk.
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) break;
+      buffer += decoder.decode(value, { stream: true });
+      const lines = buffer.split('\n');
+      buffer = lines.pop() ?? '';
+      for (const line of lines) {
+        if (line.trim()) events.push(line);
+      }
+    }
+    // Flush bytes still held by the decoder (e.g. a split multi-byte character).
+    buffer += decoder.decode();
+    if (buffer.trim()) events.push(buffer);
+
+    if (events.length === 0) return false;
+
+    const saveRes = await fetch(`${discoverBase}/api/v1/discover/article-summary`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ url: item.url, events }),
+      signal: AbortSignal.timeout(30000),
+    });
+    // A rejected save must not count as success, otherwise the caller removes
+    // the item from the queue without requeueing it.
+    if (!saveRes.ok) {
+      console.warn(`[digest-queue] Failed to save summary for ${item.url}: HTTP ${saveRes.status}`);
+      return false;
+    }
+
+    console.log(`[digest-queue] Saved summary for ${item.url.slice(0, 60)}`);
+    return true;
+  } catch (err) {
+    console.error(`[digest-queue] Error processing ${item.url}:`, err);
+    return false;
+  }
+}
+
+/**
+ * Drains the discover-svc digest queue, one item per iteration.
+ *
+ * Each iteration fetches the next item, processes it, then issues a DELETE
+ * for it with `requeue` set when processing failed. The loop ends when the
+ * queue reports no item, after more than 10 errors, or after 100 iterations.
+ *
+ * @param discoverSvcUrl discover-svc base URL (a trailing slash is tolerated)
+ */
+async function runDigestQueueProcessor(discoverSvcUrl: string): Promise<void> {
+  const baseDiscover = discoverSvcUrl.replace(/\/$/, '');
+  let processed = 0;
+  let errors = 0;
+  // Hard cap on iterations for a single run.
+  const maxIterations = 100;
+
+  console.log('[digest-queue] Starting queue processor');
+
+  for (let i = 0; i < maxIterations; i++) {
+    try {
+      const res = await fetch(`${baseDiscover}/api/v1/discover/queue`, {
+        signal: AbortSignal.timeout(10000),
+      });
+
+      if (!res.ok) {
+        // Failed queue fetch: wait and retry; this is not counted in `errors`.
+        console.warn(`[digest-queue] Queue fetch failed: HTTP ${res.status}`);
+        await new Promise((r) => setTimeout(r, DIGEST_QUEUE_POLL_INTERVAL_MS));
+        continue;
+      }
+
+      const data = (await res.json()) as { item: { url: string; title: string } | null; queueLength: number };
+
+      if (!data.item) {
+        console.log(`[digest-queue] Queue empty, stopping`);
+        break;
+      }
+
+      console.log(`[digest-queue] Queue length: ${data.queueLength}`);
+
+      const success = await processDigestQueueItem(baseDiscover, data.item);
+
+      // Best-effort acknowledgement: a failing DELETE is ignored on purpose.
+      await fetch(`${baseDiscover}/api/v1/discover/queue?url=${encodeURIComponent(data.item.url)}&requeue=${!success}`, {
+        method: 'DELETE',
+        signal: AbortSignal.timeout(5000),
+      }).catch(() => {});
+
+      if (success) {
+        processed++;
+      } else {
+        errors++;
+      }
+
+      if (errors > 10) {
+        console.warn('[digest-queue] Too many errors, stopping');
+        break;
+      }
+
+    } catch (err) {
+      // Thrown errors (network failure, invalid JSON) count as errors, but the
+      // error limit is only checked on the non-throwing path above.
+      console.error('[digest-queue] Iteration error:', err);
+      errors++;
+      await new Promise((r) => setTimeout(r, DIGEST_QUEUE_POLL_INTERVAL_MS));
+    }
+  }
+
+  console.log(`[digest-queue] Done: ${processed} processed, ${errors} errors`);
+}
+
 async function main() {
  const task = getTask();
+  const discoverTopic = getDiscoverTopic();
  // @ts-expect-error — ioredis + NodeNext ESM constructability
  const redis = new Redis(REDIS_URL);
  try {
    if (task === 'discover' || task === 'all') {
-      console.log('[cache-worker] Running discover precompute...');
-      await runDiscoverPrecompute(redis, DISCOVER_SVC_URL);
+      const label = discoverTopic ? `[${discoverTopic}]` : '';
+      console.log(`[cache-worker] Running discover precompute ${label}...`);
+      await runDiscoverPrecompute(redis, DISCOVER_SVC_URL, discoverTopic);
    }
    if (task === 'finance' || task === 'all') {
      console.log('[cache-worker] Running finance precompute...');
@@ -37,6 +185,10 @@ async function main() {
      console.log('[cache-worker] Running travel precompute...');
      await runTravelPrecompute(redis, TRAVEL_SVC_URL);
    }
+    if (task === 'digest-queue') {
+      console.log('[cache-worker] Running digest queue processor...');
+      await runDigestQueueProcessor(DISCOVER_SVC_URL);
+    }
    console.log('[cache-worker] Done');
  } finally {
    await redis.quit();
--- a/services/cache-worker/src/tasks/discover.ts
+++ b/services/cache-worker/src/tasks/discover.ts
@@ -1,30 +1,519 @@
 /**
- * Discover pre-compute: вызывает discover-svc для всех тем, заполняет Redis
- * Redis: discover:{topic} TTL 30 min
+ * Discover pre-compute: Perplexity-style multi-source digests.
+ *
+ * Pipeline per topic+region:
+ *   1. Fetch raw articles from discover-svc (SearXNG)
+ *   2. Cluster articles by topic via LLM
+ *   3. Scrape all URLs in each cluster via Crawl4AI
+ *   4. Synthesize multi-source summary with inline citations via LLM
+ *   5. Save digest to discover-svc (SQLite + Redis invalidation)
+ *
+ * Fallback: if Crawl4AI is unavailable, falls back to basic fetch+strip.
+ * Fallback: if clustering LLM fails, treat all articles as one cluster.
 */

 import type { Redis as RedisType } from 'ioredis';

-const TOPICS = ['tech', 'finance', 'travel', 'world', 'science'];
+// Topics for which digests are pre-computed.
+export const DISCOVER_TOPICS = ['tech', 'finance', 'art', 'sports', 'entertainment'] as const;
+export type DiscoverTopic = (typeof DISCOVER_TOPICS)[number];
+// Regions combined with every topic.
+const REGIONS = ['america', 'eu', 'world'] as const;
+// Raw articles kept per topic/region before clustering.
+const MAX_RAW_ARTICLES = 20;
+// Maximum clusters requested from, and accepted from, the clustering LLM call.
+const MAX_CLUSTERS = 5;
+// Maximum articles scraped per cluster.
+const MAX_URLS_PER_CLUSTER = 5;
+// Abort timeout (ms) for a single Crawl4AI request.
+const CRAWL4AI_TIMEOUT_MS = 25_000;
+// Default abort timeout (ms) for an LLM call.
+const LLM_TIMEOUT_MS = 120_000;
+// Attempts per LLM step and the pause (ms) before each retry.
+const LLM_RETRY_ATTEMPTS = 3;
+const LLM_RETRY_DELAY_MS = 5000;
+// Concurrency limits: synthesis LLM calls, and topic/region pipelines.
+const MAX_PARALLEL_SYNTHESIS = 3;
+const MAX_PARALLEL_TOPIC_REGIONS = 2;
-export async function runDiscoverPrecompute(
-  redis: RedisType,
-  discoverSvcUrl: string
-): Promise<void> {
-  for (const topic of TOPICS) {
+// LLM service base URL; empty when unset, in which case callLlm throws.
+const LLM_SVC_URL = process.env.LLM_SVC_URL?.trim() ?? '';
+// Provider and model sent with every LLM request; blank env values use the defaults.
+const LLM_DISCOVER_PROVIDER = process.env.LLM_DISCOVER_PROVIDER?.trim() || 'env';
+const LLM_DISCOVER_MODEL = process.env.LLM_DISCOVER_MODEL?.trim() || 'gpt-4o-mini';
+// Crawl4AI base URL with one trailing slash removed.
+const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
+
+/** Raw article entry as returned by discover-svc in `mode=raw`. */
+interface RawBlog {
+  title: string;
+  /** Snippet supplied with the entry; used as content when scraping yields nothing better. */
+  content?: string;
+  url: string;
+  thumbnail?: string;
+}
+
+/** One group of related articles as produced by the clustering LLM call. */
+interface ClusterDef {
+  clusterTitle: string;
+  /** Positions of the member articles in the list passed to clusterArticles. */
+  articleIndices: number[];
+  /** Index of the cluster's main article — presumably in the same index space as articleIndices; TODO confirm. */
+  mainIndex: number;
+}
+
+/** A source reference attached to a digest. */
+interface DigestCitation {
+  /** Citation number; generated fallbacks number sources from 1. */
+  index: number;
+  url: string;
+  title: string;
+  domain: string;
+}
+
+/** Parsed output of the synthesis LLM call. */
+interface SynthesisResult {
+  summaryRu: string;
+  citations: DigestCitation[];
+  followUp: string[];
+  shortDescription: string;
+}
+
+/** Article content after scraping (or the original snippet when scraping failed). */
+interface ScrapedArticle {
+  url: string;
+  title: string;
+  markdown: string;
+  /** Thumbnail URL from the raw entry; empty string when none was supplied. */
+  thumbnail: string;
+}
+
+/**
+ * Converts the `markdown` value of a Crawl4AI result to a string.
+ * Strings pass through; objects are probed for `raw_markdown`, then
+ * `markdown_with_citations`, then a non-empty `fit_markdown`. Any other input
+ * produces an empty string.
+ */
+function extractCrawl4aiMarkdown(md: unknown): string {
+  if (typeof md === 'string') return md;
+  if (md === null || typeof md !== 'object') return '';
+
+  const fields = md as Record<string, unknown>;
+  for (const key of ['raw_markdown', 'markdown_with_citations']) {
+    const candidate = fields[key];
+    if (typeof candidate === 'string') return candidate;
+  }
+
+  const fit = fields.fit_markdown;
+  return typeof fit === 'string' && fit.length > 0 ? fit : '';
+}
+
+/** Returns the hostname of `url` without a leading "www."; unparseable input is returned unchanged. */
+function extractDomain(url: string): string {
+  let host: string;
+  try {
+    host = new URL(url).hostname;
+  } catch {
+    return url;
+  }
+  return host.replace(/^www\./, '');
+}
+
+class Semaphore {
+  private permits: number;
+  private queue: (() => void)[] = [];
+
+  constructor(permits: number) {
+    this.permits = permits;
+  }
+
+  async acquire(): Promise<void> {
+    if (this.permits > 0) {
+      this.permits--;
+      return;
+    }
+    return new Promise((resolve) => {
+      this.queue.push(resolve);
+    });
+  }
+
+  release(): void {
+    const next = this.queue.shift();
+    if (next) {
+      next();
+    } else {
+      this.permits++;
+    }
+  }
+
+  async runExclusive<T>(fn: () => Promise<T>): Promise<T> {
+    await this.acquire();
    try {
-      const url = `${discoverSvcUrl.replace(/\/$/, '')}/api/v1/discover?topic=${topic}`;
-      const res = await fetch(url, { signal: AbortSignal.timeout(60000) });
-      if (!res.ok) {
-        console.warn(`[discover] ${topic}: HTTP ${res.status}`);
-        continue;
-      }
-      const data = await res.json();
-      const key = `discover:${topic}`;
-      await redis.setex(key, 30 * 60, JSON.stringify(data));
-      console.log(`[discover] ${topic}: cached ${data?.items?.length ?? 0} items`);
-    } catch (err) {
-      console.error(`[discover] ${topic}:`, err);
+      return await fn();
+    } finally {
+      this.release();
    }
  }
 }
+
+/**
+ * Scrapes a single URL through the Crawl4AI `/crawl` endpoint.
+ *
+ * @param url page to scrape
+ * @returns the page title ('' when absent) and its markdown, truncated to 10,000 characters
+ * @throws Error on a non-OK HTTP response, or when the first result is
+ *         unsuccessful or has no markdown; fetch/timeout errors propagate
+ */
+async function scrapeViaCrawl4ai(url: string): Promise<{ title: string; markdown: string }> {
+  const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      urls: [url],
+      crawler_config: {
+        type: 'CrawlerRunConfig',
+        params: { cache_mode: 'default', page_timeout: 15000 },
+      },
+    }),
+    signal: AbortSignal.timeout(CRAWL4AI_TIMEOUT_MS),
+  });
+  if (!res.ok) throw new Error(`Crawl4AI HTTP ${res.status}`);
+  const data = (await res.json()) as {
+    results?: { url: string; markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
+  };
+  // Only one URL was submitted, so only the first result is inspected.
+  const first = data.results?.[0];
+  const md = extractCrawl4aiMarkdown(first?.markdown);
+  if (!first?.success || !md) throw new Error('Crawl4AI empty result');
+  return { title: first.metadata?.title ?? '', markdown: md.slice(0, 10000) };
+}
+
+/**
+ * Minimal scraper used when Crawl4AI fails: fetches the page directly and
+ * strips it down to text.
+ *
+ * @param url page to fetch
+ * @returns the <title> text ('' when absent) and the tag-stripped body,
+ *          truncated to 10,000 characters
+ * @throws Error on a non-OK HTTP status, so error pages are never used as
+ *         article content; fetch/timeout errors propagate
+ */
+async function scrapeFallback(url: string): Promise<{ title: string; markdown: string }> {
+  const res = await fetch(url, {
+    signal: AbortSignal.timeout(10000),
+    headers: { 'User-Agent': 'GooSeek-CacheWorker/1.0' },
+  });
+  // A 4xx/5xx body is an error or block page, not the article; the caller
+  // catches this and keeps the snippet it already has.
+  if (!res.ok) throw new Error(`Fallback fetch HTTP ${res.status}`);
+  const html = await res.text();
+  const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? '';
+  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
+  // Fall back to the whole document when no <body> element is found.
+  const body = bodyMatch ? bodyMatch[1] : html;
+  const markdown = body
+    .replace(/<script[\s\S]*?<\/script>/gi, '')
+    .replace(/<style[\s\S]*?<\/style>/gi, '')
+    .replace(/<[^>]+>/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim()
+    .slice(0, 10000);
+  return { title, markdown };
+}
+
+/**
+ * Produces the best available content for one raw article.
+ * Starts from the title/snippet supplied with the entry, then tries Crawl4AI;
+ * if that throws, tries the plain-fetch fallback. A scrape is adopted only when
+ * it has more than 100 characters of markdown. Never rejects.
+ */
+async function scrapeArticle(blog: RawBlog): Promise<ScrapedArticle> {
+  const article: ScrapedArticle = {
+    url: blog.url,
+    title: blog.title || blog.url,
+    markdown: (blog.content ?? '').trim(),
+    thumbnail: blog.thumbnail ?? '',
+  };
+
+  const adopt = (page: { title: string; markdown: string }): void => {
+    if (page.markdown.length <= 100) return;
+    article.markdown = page.markdown;
+    if (page.title) article.title = page.title;
+  };
+
+  try {
+    adopt(await scrapeViaCrawl4ai(blog.url));
+  } catch {
+    try {
+      adopt(await scrapeFallback(blog.url));
+    } catch {
+      // Both scrapers failed: keep the title/content supplied with the entry.
+    }
+  }
+
+  return article;
+}
+
+// Running totals of prompt characters, response characters and successful LLM calls;
+// updated by callLlm, reported by logTokenUsage and reset by runDiscoverPrecompute.
+let totalInputChars = 0;
+let totalOutputChars = 0;
+let totalLlmCalls = 0;
+
+/** Rough token estimate: one token per four characters, rounded up. */
+function estimateTokens(chars: number): number {
+  const CHARS_PER_TOKEN = 4;
+  return Math.ceil(chars / CHARS_PER_TOKEN);
+}
+
+/**
+ * Sends a system + user prompt to the LLM service and returns the trimmed reply.
+ *
+ * @param systemPrompt content of the system message
+ * @param userPrompt content of the user message
+ * @param timeoutMs abort timeout for the request (defaults to LLM_TIMEOUT_MS)
+ * @returns the `content` field of the response, trimmed ('' when absent)
+ * @throws Error when LLM_SVC_URL is not set or the service answers with a
+ *         non-OK status; fetch/timeout and JSON errors propagate
+ */
+async function callLlm(
+  systemPrompt: string,
+  userPrompt: string,
+  timeoutMs: number = LLM_TIMEOUT_MS,
+): Promise<string> {
+  if (!LLM_SVC_URL) throw new Error('LLM_SVC_URL not set');
+  const base = LLM_SVC_URL.replace(/\/$/, '');
+  const inputChars = systemPrompt.length + userPrompt.length;
+
+  const res = await fetch(`${base}/api/v1/generate`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      model: { providerId: LLM_DISCOVER_PROVIDER, key: LLM_DISCOVER_MODEL },
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: userPrompt },
+      ],
+    }),
+    signal: AbortSignal.timeout(timeoutMs),
+  });
+  if (!res.ok) throw new Error(`LLM HTTP ${res.status}`);
+  const data = (await res.json()) as { content?: string };
+  const output = (data.content ?? '').trim();
+
+  // Usage counters are updated only for calls that completed successfully.
+  totalInputChars += inputChars;
+  totalOutputChars += output.length;
+  totalLlmCalls++;
+
+  return output;
+}
+
+/**
+ * Logs the estimated token usage and cost accumulated in the module counters.
+ * The cost uses fixed rates of 0.15 (input) and 0.60 (output) per million tokens.
+ */
+function logTokenUsage(): void {
+  const PER_MILLION = 1_000_000;
+  const tokensIn = estimateTokens(totalInputChars);
+  const tokensOut = estimateTokens(totalOutputChars);
+
+  const spendIn = (tokensIn / PER_MILLION) * 0.15;
+  const spendOut = (tokensOut / PER_MILLION) * 0.60;
+  const estimatedCost = (spendIn + spendOut).toFixed(4);
+
+  console.log(`[discover] Token usage: ~${tokensIn} input, ~${tokensOut} output (${totalLlmCalls} calls)`);
+  console.log(`[discover] Estimated cost (gpt-4o-mini): $${estimatedCost}`);
+}
+
+/**
+ * Parses JSON from an LLM reply, tolerating an optional Markdown code fence.
+ *
+ * Strips a leading "```" or "```json" fence and a trailing "```" before
+ * parsing. The language tag is optional as a whole: the earlier `json?`
+ * pattern required at least "jso" and so left bare fences in place.
+ *
+ * @param raw reply text from the model
+ * @returns the parsed value, cast to T without validation
+ * @throws SyntaxError when the cleaned text is not valid JSON
+ */
+function parseJsonFromLlm<T>(raw: string): T {
+  const cleaned = raw
+    .trim()
+    .replace(/^```(?:json)?\s*/i, '')
+    .replace(/\s*```$/, '')
+    .trim();
+  return JSON.parse(cleaned) as T;
+}
+
+/**
+ * Validates one element of the clustering reply.
+ * Returns a well-formed ClusterDef, or null when the element cannot be used.
+ */
+function normalizeCluster(value: unknown): ClusterDef | null {
+  if (!value || typeof value !== 'object') return null;
+  const candidate = value as Record<string, unknown>;
+  const { clusterTitle, articleIndices, mainIndex } = candidate;
+  if (typeof clusterTitle !== 'string' || !clusterTitle.trim()) return null;
+  if (!Array.isArray(articleIndices)) return null;
+  const indices = articleIndices.filter((i): i is number => Number.isInteger(i));
+  if (indices.length === 0) return null;
+  return {
+    clusterTitle,
+    articleIndices: indices,
+    // Fall back to the cluster's first article when mainIndex is missing or not an integer.
+    mainIndex: Number.isInteger(mainIndex) ? (mainIndex as number) : indices[0],
+  };
+}
+
+/**
+ * Groups raw articles into at most MAX_CLUSTERS topical clusters via the LLM.
+ *
+ * Makes up to LLM_RETRY_ATTEMPTS attempts, pausing LLM_RETRY_DELAY_MS before
+ * each retry. Malformed cluster entries are dropped so callers can rely on
+ * `articleIndices` being an array of integers.
+ *
+ * @param articles raw articles; their positions are the indices the LLM refers to
+ * @returns the validated clusters, or a single cluster containing every
+ *          article when no attempt produced a usable result (never throws)
+ */
+async function clusterArticles(articles: RawBlog[]): Promise<ClusterDef[]> {
+  const listing = articles
+    .map((a, i) => `[${i}] ${a.title} — ${a.url}`)
+    .join('\n');
+
+  const systemPrompt = 'Ты группируешь новости по темам. Возвращаешь ТОЛЬКО валидный JSON массив без markdown.';
+  const userPrompt = `Сгруппируй эти ${articles.length} новостей по темам/историям. Каждая группа — одна "история" или тема.
+Объедини похожие/дублирующиеся новости в одну группу.
+Верни JSON массив: [{"clusterTitle": "Краткий заголовок темы на русском", "articleIndices": [0,2,5], "mainIndex": 0}]
+Максимум ${MAX_CLUSTERS} групп. mainIndex — индекс главной/лучшей статьи в группе.
+
+Статьи:
+${listing}`;
+
+  for (let attempt = 0; attempt < LLM_RETRY_ATTEMPTS; attempt++) {
+    try {
+      if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));
+      const raw = await callLlm(systemPrompt, userPrompt);
+      const parsed = parseJsonFromLlm<unknown>(raw);
+      const clusters = Array.isArray(parsed)
+        ? parsed
+            .map(normalizeCluster)
+            .filter((c): c is ClusterDef => c !== null)
+        : [];
+      if (clusters.length > 0) {
+        return clusters.slice(0, MAX_CLUSTERS);
+      }
+    } catch (e) {
+      console.warn(`[discover] cluster attempt ${attempt + 1} failed:`, e);
+    }
+  }
+
+  // Clustering failed: treat all articles as one cluster.
+  return [{
+    clusterTitle: articles[0]?.title ?? 'Новости',
+    articleIndices: articles.map((_, i) => i),
+    mainIndex: 0,
+  }];
+}
+
+/**
+ * Asks the LLM for a multi-source summary of one cluster.
+ *
+ * Makes up to LLM_RETRY_ATTEMPTS attempts, pausing LLM_RETRY_DELAY_MS before
+ * each retry, with a 180 s timeout per call.
+ *
+ * @param clusterTitle cluster headline included in the prompt
+ * @param scrapedArticles sources; each contributes at most 1,500 characters of markdown
+ * @returns the parsed result with missing optional parts filled in, or null
+ *          when no attempt produced a usable result (never throws)
+ */
+async function synthesizeDigest(
+  clusterTitle: string,
+  scrapedArticles: ScrapedArticle[],
+): Promise<SynthesisResult | null> {
+  // Sources are numbered from 1 so the model's [N] references match the listing.
+  const sourcesListing = scrapedArticles
+    .map((a, i) => `[${i + 1}] ${a.title}\n${a.url}\n${a.markdown.slice(0, 1500)}`)
+    .join('\n---\n');
+
+  const systemPrompt = 'Синтезируй новости в сводку на русском. JSON без markdown.';
+
+  const userPrompt = `Тема: "${clusterTitle}"
+
+${sourcesListing}
+
+Синтезируй в сводку 3-5 абзацев. Ссылки [N] на источники. Русский язык.
+JSON: {"summaryRu":"...[1][2]...","citations":[{"index":1,"url":"","title":"","domain":""}],"followUp":["?","?"],"shortDescription":"..."}`;
+
+  for (let attempt = 0; attempt < LLM_RETRY_ATTEMPTS; attempt++) {
+    try {
+      if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));
+      const raw = await callLlm(systemPrompt, userPrompt, 180_000);
+      const result = parseJsonFromLlm<SynthesisResult>(raw);
+      if (result.summaryRu && result.citations) {
+        // Empty citation list: cite every scraped source in listing order.
+        if (!result.citations.length) {
+          result.citations = scrapedArticles.map((a, i) => ({
+            index: i + 1,
+            url: a.url,
+            title: a.title,
+            domain: extractDomain(a.url),
+          }));
+        }
+        if (!result.followUp?.length) {
+          result.followUp = [];
+        }
+        // No short description: use the first 200 characters of the summary.
+        if (!result.shortDescription) {
+          result.shortDescription = result.summaryRu.slice(0, 200) + '…';
+        }
+        return result;
+      }
+    } catch (e) {
+      console.warn(`[discover] synthesis attempt ${attempt + 1} failed for "${clusterTitle}":`, e);
+    }
+  }
+  return null;
+}
+
+/**
+ * Posts a finished digest to discover-svc.
+ *
+ * @param discoverBase discover-svc base URL without a trailing slash
+ * @param payload digest fields, sent as the JSON request body
+ * @returns true only when the service answers 204; thrown errors are logged
+ *          and reported as false
+ */
+async function saveDigest(
+  discoverBase: string,
+  payload: {
+    topic: string;
+    region: string;
+    clusterTitle: string;
+    summaryRu: string;
+    citations: DigestCitation[];
+    sourcesCount: number;
+    followUp: string[];
+    thumbnail: string;
+    shortDescription: string;
+    mainUrl: string;
+  },
+): Promise<boolean> {
+  try {
+    const res = await fetch(`${discoverBase}/api/v1/discover/digest`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+      signal: AbortSignal.timeout(10000),
+    });
+    // NOTE(review): any other 2xx status counts as a failed save — confirm
+    // that discover-svc always answers 204 here.
+    return res.status === 204;
+  } catch (e) {
+    console.error(`[discover] save digest failed:`, e);
+    return false;
+  }
+}
+
+/** Everything needed to process one cluster of a topic/region pair. */
+interface ClusterProcessingData {
+  cluster: ClusterDef;
+  /** Articles of the cluster after index validation, capped at MAX_URLS_PER_CLUSTER. */
+  clusterBlogs: RawBlog[];
+  topic: string;
+  region: string;
+}
+
+/**
+ * Runs the digest pipeline for one topic/region pair: fetch raw articles,
+ * cluster them, then scrape, synthesize and save a digest per cluster.
+ *
+ * @param baseDiscover discover-svc base URL without a trailing slash
+ * @param topic topic to process
+ * @param region region to process
+ * @param synthesisSemaphore shared limiter for concurrent synthesis LLM calls
+ * @returns number of clusters whose digest was saved or already existed, and
+ *          the number of clusters returned by clustering
+ */
+async function processTopicRegion(
+  baseDiscover: string,
+  topic: string,
+  region: string,
+  synthesisSemaphore: Semaphore,
+): Promise<{ saved: number; total: number }> {
+  console.log(`[discover] ${topic}:${region} — starting digest pipeline`);
+
+  const rawUrl = `${baseDiscover}/api/v1/discover?topic=${topic}&region=${region}&mode=raw`;
+  const rawRes = await fetch(rawUrl, { signal: AbortSignal.timeout(60000) });
+  if (!rawRes.ok) {
+    console.warn(`[discover] ${topic}:${region} raw HTTP ${rawRes.status}`);
+    return { saved: 0, total: 0 };
+  }
+  const rawData = (await rawRes.json()) as { blogs?: RawBlog[] };
+  const blogs = Array.isArray(rawData?.blogs) ? rawData.blogs : [];
+  if (blogs.length === 0) {
+    console.log(`[discover] ${topic}:${region} — no raw articles`);
+    return { saved: 0, total: 0 };
+  }
+
+  const articlesToProcess = blogs.slice(0, MAX_RAW_ARTICLES);
+  console.log(`[discover] ${topic}:${region} — ${articlesToProcess.length} raw articles`);
+
+  const clusters = await clusterArticles(articlesToProcess);
+  console.log(`[discover] ${topic}:${region} — ${clusters.length} clusters`);
+
+  // Resolve cluster indices to articles, dropping out-of-range indices and
+  // clusters that end up empty.
+  const clusterData: ClusterProcessingData[] = clusters
+    .map((cluster) => {
+      const clusterBlogs = cluster.articleIndices
+        .filter((i) => i >= 0 && i < articlesToProcess.length)
+        .slice(0, MAX_URLS_PER_CLUSTER)
+        .map((i) => articlesToProcess[i]);
+      return { cluster, clusterBlogs, topic, region };
+    })
+    .filter((d) => d.clusterBlogs.length > 0);
+
+  const processCluster = async (data: ClusterProcessingData): Promise<boolean> => {
+    const { cluster, clusterBlogs, topic, region } = data;
+
+    // mainIndex is a position in the article list (like articleIndices), not
+    // in clusterBlogs. Look the article up there, and use it only if it
+    // survived filtering/truncation into this cluster.
+    const indexedMain = articlesToProcess[cluster.mainIndex];
+    const mainBlog = indexedMain !== undefined && clusterBlogs.includes(indexedMain)
+      ? indexedMain
+      : clusterBlogs[0];
+
+    // Skip clusters whose main article already has a digest.
+    try {
+      const checkRes = await fetch(
+        `${baseDiscover}/api/v1/discover/digest?url=${encodeURIComponent(mainBlog.url)}`,
+        { signal: AbortSignal.timeout(5000) },
+      );
+      if (checkRes.ok) {
+        console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — digest exists, skipping`);
+        return true;
+      }
+    } catch {
+      // continue with synthesis
+    }
+
+    const scrapeResults = await Promise.allSettled(
+      clusterBlogs.map((blog) => scrapeArticle(blog)),
+    );
+    const scraped = scrapeResults
+      .filter((r): r is PromiseFulfilledResult<ScrapedArticle> => r.status === 'fulfilled')
+      .map((r) => r.value)
+      .filter((a) => a.markdown.length > 50);
+
+    if (scraped.length === 0) {
+      console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — no scraped content`);
+      return false;
+    }
+
+    console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — ${scraped.length} scraped`);
+
+    // Only the LLM call is rate-limited; scraping above runs unrestricted.
+    const synthesis = await synthesisSemaphore.runExclusive(() =>
+      synthesizeDigest(cluster.clusterTitle, scraped),
+    );
+
+    if (!synthesis) {
+      console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — synthesis failed`);
+      return false;
+    }
+
+    const mainScraped = scraped.find((s) => s.url === mainBlog.url);
+    const thumbnail = mainScraped?.thumbnail || mainBlog.thumbnail || '';
+
+    return saveDigest(baseDiscover, {
+      topic,
+      region,
+      clusterTitle: cluster.clusterTitle,
+      summaryRu: synthesis.summaryRu,
+      citations: synthesis.citations,
+      sourcesCount: scraped.length,
+      followUp: synthesis.followUp,
+      thumbnail,
+      shortDescription: synthesis.shortDescription,
+      mainUrl: mainBlog.url,
+    });
+  };
+
+  const results = await Promise.allSettled(
+    clusterData.map((data) => processCluster(data)),
+  );
+
+  const savedCount = results.filter(
+    (r) => r.status === 'fulfilled' && r.value === true,
+  ).length;
+
+  console.log(`[discover] ${topic}:${region} — ${savedCount}/${clusters.length} digests saved`);
+  return { saved: savedCount, total: clusters.length };
+}
+
+/**
+ * Pre-computes discover digests for every topic/region pair.
+ *
+ * At most MAX_PARALLEL_TOPIC_REGIONS pipelines run at once, and they share a
+ * limit of MAX_PARALLEL_SYNTHESIS concurrent synthesis calls. A failing pair
+ * is logged and does not stop the others. Totals and the estimated token
+ * usage are logged at the end.
+ *
+ * @param _redis unused; kept so existing callers keep working
+ * @param discoverSvcUrl discover-svc base URL (a trailing slash is tolerated)
+ * @param topicFilter when given and supported, only this topic is processed
+ */
+export async function runDiscoverPrecompute(
+  _redis: RedisType,
+  discoverSvcUrl: string,
+  topicFilter?: DiscoverTopic,
+): Promise<void> {
+  // Usage counters are per run.
+  totalInputChars = 0;
+  totalOutputChars = 0;
+  totalLlmCalls = 0;
+
+  const baseDiscover = discoverSvcUrl.replace(/\/$/, '');
+  const topics = topicFilter && DISCOVER_TOPICS.includes(topicFilter)
+    ? [topicFilter]
+    : [...DISCOVER_TOPICS];
+
+  const topicRegionPairs: { topic: string; region: string }[] = [];
+  for (const topic of topics) {
+    for (const region of REGIONS) {
+      topicRegionPairs.push({ topic, region });
+    }
+  }
+
+  const topicRegionSemaphore = new Semaphore(MAX_PARALLEL_TOPIC_REGIONS);
+  const synthesisSemaphore = new Semaphore(MAX_PARALLEL_SYNTHESIS);
+
+  const startTime = Date.now();
+  console.log(`[discover] Starting parallel processing of ${topicRegionPairs.length} topic/region pairs`);
+
+  const results = await Promise.allSettled(
+    topicRegionPairs.map(({ topic, region }) =>
+      topicRegionSemaphore.runExclusive(() =>
+        processTopicRegion(baseDiscover, topic, region, synthesisSemaphore),
+      ),
+    ),
+  );
+
+  let totalSaved = 0;
+  let totalClusters = 0;
+  results.forEach((r, i) => {
+    if (r.status === 'fulfilled') {
+      totalSaved += r.value.saved;
+      totalClusters += r.value.total;
+      return;
+    }
+    // Log failed pairs; without this they would vanish from the totals unnoticed.
+    const { topic, region } = topicRegionPairs[i];
+    console.error(`[discover] ${topic}:${region} — pipeline failed:`, r.reason);
+  });
+
+  const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
+  console.log(`[discover] Completed: ${totalSaved}/${totalClusters} digests saved in ${elapsedSec}s`);
+  logTokenUsage();
+}