feat: Go backend, enhanced search, new widgets, Docker deploy

Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler,
  proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard,
  UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
This commit is contained in: home
Date: 2026-02-27 04:15:32 +03:00
parent 328d968f3f
commit 06fe57c765
285 changed files with 53132 additions and 1871 deletions

@@ -1,7 +1,9 @@
# syntax=docker/dockerfile:1
FROM node:22-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY --from=npm-cache / /tmp/npm-cache
RUN npm install --cache /tmp/npm-cache --prefer-offline --no-audit
COPY tsconfig.json ./
COPY src ./src
RUN npm run build
@@ -9,7 +11,8 @@ RUN npm run build
FROM node:22-alpine
WORKDIR /app
COPY package*.json ./
RUN npm ci --omit=dev
COPY --from=npm-cache / /tmp/npm-cache
RUN npm install --omit=dev --cache /tmp/npm-cache --prefer-offline --no-audit
COPY --from=builder /app/dist ./dist
EXPOSE 9090
ENTRYPOINT ["node", "dist/run.js"]

@@ -11,7 +11,9 @@
"build": "tsc"
},
"dependencies": {
"ioredis": "^5.4.1"
"https-proxy-agent": "^7.0.6",
"ioredis": "^5.4.1",
"socks-proxy-agent": "^8.0.5"
},
"devDependencies": {
"@types/node": "^22.10.0",

@@ -0,0 +1,154 @@
/**
* Crawl4AI REST API client for cache-worker with stealth support
*/
export interface CrawlResult {
url: string;
title: string;
content: string;
success: boolean;
statusCode?: number;
}
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? '').replace(/\/$/, '');
const TIMEOUT_MS = 45_000;
const MAX_URLS_PER_BATCH = 15;
const USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
];
function randomUserAgent(): string {
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
function extractCrawl4aiMarkdown(md: unknown): string {
if (typeof md === 'string') return md;
if (md && typeof md === 'object') {
const obj = md as Record<string, unknown>;
if (typeof obj.raw_markdown === 'string') return obj.raw_markdown;
if (typeof obj.markdown_with_citations === 'string') return obj.markdown_with_citations;
if (typeof obj.fit_markdown === 'string' && (obj.fit_markdown as string).length > 0)
return obj.fit_markdown;
}
return '';
}
function stripHtmlToText(html: string): string {
return html
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.slice(0, 50_000);
}
async function fetchFallback(url: string): Promise<CrawlResult> {
try {
const res = await fetch(url, {
signal: AbortSignal.timeout(15000),
headers: {
'User-Agent': randomUserAgent(),
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
},
});
const html = await res.text();
const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
const body = bodyMatch ? bodyMatch[1] : html;
const content = stripHtmlToText(body);
return {
url,
title,
content: content || title,
success: res.ok,
statusCode: res.status,
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
return {
url,
title: `Error: ${url}`,
content: `Failed to fetch: ${msg}`,
success: false,
};
}
}
/**
* Crawl multiple URLs via Crawl4AI with stealth settings
*/
export async function crawlUrls(urls: string[]): Promise<CrawlResult[]> {
const list = urls.slice(0, MAX_URLS_PER_BATCH);
if (list.length === 0) return [];
if (!CRAWL4AI_URL) {
return Promise.all(list.map(fetchFallback));
}
try {
const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
urls: list,
crawler_config: {
type: 'CrawlerRunConfig',
params: {
cache_mode: 'bypass',
page_timeout: 30000,
user_agent: randomUserAgent(),
simulate_user: true,
override_navigator: true,
},
},
}),
signal: AbortSignal.timeout(TIMEOUT_MS),
});
if (!res.ok) {
throw new Error(`Crawl4AI HTTP ${res.status}`);
}
const data = (await res.json()) as {
results?: Array<{
url?: string;
success?: boolean;
markdown?: unknown;
metadata?: { title?: string };
status_code?: number;
}>;
};
const rawResults = data.results ?? [];
const byUrl = new Map<string, (typeof rawResults)[0]>();
for (const r of rawResults) {
const u = r.url ?? '';
if (u && !byUrl.has(u)) byUrl.set(u, r);
}
const results: CrawlResult[] = [];
for (const url of list) {
const r = byUrl.get(url);
const markdown = extractCrawl4aiMarkdown(r?.markdown);
if (r?.success && markdown.length > 100) {
results.push({
url,
title: r.metadata?.title ?? url,
content: markdown,
success: true,
statusCode: r.status_code,
});
} else {
results.push(await fetchFallback(url));
}
}
return results;
} catch {
return Promise.all(list.map(fetchFallback));
}
}
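
A minimal usage sketch of this client (illustrative, not part of the diff). The import path is assumed; with CRAWL4AI_URL unset, the call exercises the plain fetch fallback:

import { crawlUrls, type CrawlResult } from './crawl4ai.js'; // path assumed

const results: CrawlResult[] = await crawlUrls([
  'https://example.com/article-1',
  'https://example.com/article-2',
]);

for (const r of results) {
  console.log(r.success ? 'ok' : 'fail', r.statusCode ?? '-', r.url, r.title);
}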

@@ -1,12 +1,15 @@
/**
* cache-worker — pre-compute discover, finance, travel
* cache-worker — pre-compute discover, finance, travel, digest-queue
* docs/architecture: 03-cache-and-precompute-strategy.md
* Cron: discover every 15m, finance every 2m, travel every 4h
* Usage: node run.js --task=discover|finance|travel|all
* Cron: discover every 15m, finance every 2m, travel every 4h, digest-queue continuous
* Usage: node run.js --task=discover|finance|travel|digest-queue|all [--topic=tech|finance|art|sports|entertainment]
*/
import Redis from 'ioredis';
import { runDiscoverPrecompute } from './tasks/discover.js';
import {
runDiscoverPrecompute,
type DiscoverTopic,
} from './tasks/discover.js';
import { runFinancePrecompute } from './tasks/finance.js';
import { runTravelPrecompute } from './tasks/travel.js';
@@ -14,20 +17,165 @@ const REDIS_URL = process.env.REDIS_URL ?? 'redis://localhost:6379';
const DISCOVER_SVC_URL = process.env.DISCOVER_SVC_URL ?? 'http://localhost:3002';
const FINANCE_SVC_URL = process.env.FINANCE_SVC_URL ?? 'http://localhost:3003';
const TRAVEL_SVC_URL = process.env.TRAVEL_SVC_URL ?? 'http://localhost:3004';
const MASTER_AGENTS_SVC_URL = process.env.MASTER_AGENTS_SVC_URL ?? 'http://localhost:3006';
const DIGEST_QUEUE_BATCH_SIZE = 5;
const DIGEST_QUEUE_POLL_INTERVAL_MS = 5000;
function getTask(): string {
const idx = process.argv.indexOf('--task');
return idx >= 0 && process.argv[idx + 1] ? process.argv[idx + 1] : 'all';
}
function getDiscoverTopic(): DiscoverTopic | undefined {
const idx = process.argv.indexOf('--topic');
const value = idx >= 0 ? process.argv[idx + 1] : undefined;
if (
value &&
['tech', 'finance', 'art', 'sports', 'entertainment'].includes(value)
) {
return value as DiscoverTopic;
}
return undefined;
}
async function processDigestQueueItem(
discoverBase: string,
item: { url: string; title: string },
): Promise<boolean> {
console.log(`[digest-queue] Processing: ${item.url.slice(0, 60)}`);
try {
const res = await fetch(`${MASTER_AGENTS_SVC_URL.replace(/\/$/, '')}/api/v1/agents/search`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
message: {
messageId: crypto.randomUUID(),
chatId: 'digest-queue',
content: `Summary: ${item.url}`,
},
optimizationMode: 'balanced',
sources: ['web'],
history: [],
files: [],
chatModel: { providerId: 'env', key: 'default' },
systemInstructions: '',
locale: 'ru',
answerMode: 'standard',
}),
signal: AbortSignal.timeout(180000),
duplex: 'half',
} as RequestInit);
if (!res.ok) {
console.warn(`[digest-queue] Failed to generate summary for ${item.url}: HTTP ${res.status}`);
return false;
}
const events: string[] = [];
const reader = res.body?.getReader();
if (!reader) return false;
const decoder = new TextDecoder();
let buffer = '';
while (true) {
const { value, done } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split('\n');
buffer = lines.pop() ?? '';
for (const line of lines) {
if (line.trim()) events.push(line);
}
}
if (buffer.trim()) events.push(buffer);
if (events.length > 0) {
await fetch(`${discoverBase}/api/v1/discover/article-summary`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url: item.url, events }),
signal: AbortSignal.timeout(30000),
});
console.log(`[digest-queue] Saved summary for ${item.url.slice(0, 60)}`);
return true;
}
return false;
} catch (err) {
console.error(`[digest-queue] Error processing ${item.url}:`, err);
return false;
}
}
async function runDigestQueueProcessor(discoverSvcUrl: string): Promise<void> {
const baseDiscover = discoverSvcUrl.replace(/\/$/, '');
let processed = 0;
let errors = 0;
const maxIterations = 100;
console.log('[digest-queue] Starting queue processor');
for (let i = 0; i < maxIterations; i++) {
try {
const res = await fetch(`${baseDiscover}/api/v1/discover/queue`, {
signal: AbortSignal.timeout(10000),
});
if (!res.ok) {
console.warn(`[digest-queue] Queue fetch failed: HTTP ${res.status}`);
await new Promise((r) => setTimeout(r, DIGEST_QUEUE_POLL_INTERVAL_MS));
continue;
}
const data = (await res.json()) as { item: { url: string; title: string } | null; queueLength: number };
if (!data.item) {
console.log(`[digest-queue] Queue empty, stopping`);
break;
}
console.log(`[digest-queue] Queue length: ${data.queueLength}`);
const success = await processDigestQueueItem(baseDiscover, data.item);
await fetch(`${baseDiscover}/api/v1/discover/queue?url=${encodeURIComponent(data.item.url)}&requeue=${!success}`, {
method: 'DELETE',
signal: AbortSignal.timeout(5000),
}).catch(() => {});
if (success) {
processed++;
} else {
errors++;
}
if (errors > 10) {
console.warn('[digest-queue] Too many errors, stopping');
break;
}
} catch (err) {
console.error('[digest-queue] Iteration error:', err);
errors++;
await new Promise((r) => setTimeout(r, DIGEST_QUEUE_POLL_INTERVAL_MS));
}
}
console.log(`[digest-queue] Done: ${processed} processed, ${errors} errors`);
}
async function main() {
const task = getTask();
const discoverTopic = getDiscoverTopic();
// @ts-expect-error — ioredis + NodeNext ESM constructability
const redis = new Redis(REDIS_URL);
try {
if (task === 'discover' || task === 'all') {
console.log('[cache-worker] Running discover precompute...');
await runDiscoverPrecompute(redis, DISCOVER_SVC_URL);
const label = discoverTopic ? `[${discoverTopic}]` : '';
console.log(`[cache-worker] Running discover precompute ${label}...`);
await runDiscoverPrecompute(redis, DISCOVER_SVC_URL, discoverTopic);
}
if (task === 'finance' || task === 'all') {
console.log('[cache-worker] Running finance precompute...');
@@ -37,6 +185,10 @@ async function main() {
console.log('[cache-worker] Running travel precompute...');
await runTravelPrecompute(redis, TRAVEL_SVC_URL);
}
if (task === 'digest-queue') {
console.log('[cache-worker] Running digest queue processor...');
await runDigestQueueProcessor(DISCOVER_SVC_URL);
}
console.log('[cache-worker] Done');
} finally {
await redis.quit();
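
The queue loop above implies a small HTTP contract on the discover-svc side: GET returns the next item plus the current queue length, and DELETE acknowledges the item (or requeues it on failure). A sketch of that contract as inferred from the calls in this hunk; the type and function names are illustrative, not part of the diff:

// Shape returned by GET {DISCOVER_SVC_URL}/api/v1/discover/queue, as consumed above.
interface QueuePeekResponse {
  item: { url: string; title: string } | null;
  queueLength: number;
}

// Acknowledge (or requeue) an item via DELETE /api/v1/discover/queue?url=...&requeue=...
async function ackQueueItem(base: string, url: string, requeue: boolean): Promise<void> {
  await fetch(
    `${base}/api/v1/discover/queue?url=${encodeURIComponent(url)}&requeue=${requeue}`,
    { method: 'DELETE', signal: AbortSignal.timeout(5000) },
  );
}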

@@ -1,30 +1,519 @@
/**
* Discover pre-compute: calls discover-svc for every topic and fills Redis
* Redis: discover:{topic} TTL 30 min
* Discover pre-compute: Perplexity-style multi-source digests.
*
* Pipeline per topic+region:
* 1. Fetch raw articles from discover-svc (SearXNG)
* 2. Cluster articles by topic via LLM
* 3. Scrape all URLs in each cluster via Crawl4AI
* 4. Synthesize multi-source summary with inline citations via LLM
* 5. Save digest to discover-svc (SQLite + Redis invalidation)
*
* Fallback: if Crawl4AI is unavailable, falls back to basic fetch+strip.
* Fallback: if clustering LLM fails, treat all articles as one cluster.
*/
import type { Redis as RedisType } from 'ioredis';
const TOPICS = ['tech', 'finance', 'travel', 'world', 'science'];
export const DISCOVER_TOPICS = ['tech', 'finance', 'art', 'sports', 'entertainment'] as const;
export type DiscoverTopic = (typeof DISCOVER_TOPICS)[number];
const REGIONS = ['america', 'eu', 'world'] as const;
const MAX_RAW_ARTICLES = 20;
const MAX_CLUSTERS = 5;
const MAX_URLS_PER_CLUSTER = 5;
const CRAWL4AI_TIMEOUT_MS = 25_000;
const LLM_TIMEOUT_MS = 120_000;
const LLM_RETRY_ATTEMPTS = 3;
const LLM_RETRY_DELAY_MS = 5000;
const MAX_PARALLEL_SYNTHESIS = 3;
const MAX_PARALLEL_TOPIC_REGIONS = 2;
export async function runDiscoverPrecompute(
redis: RedisType,
discoverSvcUrl: string
): Promise<void> {
for (const topic of TOPICS) {
const LLM_SVC_URL = process.env.LLM_SVC_URL?.trim() ?? '';
const LLM_DISCOVER_PROVIDER = process.env.LLM_DISCOVER_PROVIDER?.trim() || 'env';
const LLM_DISCOVER_MODEL = process.env.LLM_DISCOVER_MODEL?.trim() || 'gpt-4o-mini';
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
interface RawBlog {
title: string;
content?: string;
url: string;
thumbnail?: string;
}
interface ClusterDef {
clusterTitle: string;
articleIndices: number[];
mainIndex: number;
}
interface DigestCitation {
index: number;
url: string;
title: string;
domain: string;
}
interface SynthesisResult {
summaryRu: string;
citations: DigestCitation[];
followUp: string[];
shortDescription: string;
}
interface ScrapedArticle {
url: string;
title: string;
markdown: string;
thumbnail: string;
}
function extractCrawl4aiMarkdown(md: unknown): string {
if (typeof md === 'string') return md;
if (md && typeof md === 'object') {
const obj = md as Record<string, unknown>;
if (typeof obj.raw_markdown === 'string') return obj.raw_markdown;
if (typeof obj.markdown_with_citations === 'string') return obj.markdown_with_citations;
if (typeof obj.fit_markdown === 'string' && (obj.fit_markdown as string).length > 0) return obj.fit_markdown;
}
return '';
}
function extractDomain(url: string): string {
try {
return new URL(url).hostname.replace(/^www\./, '');
} catch {
return url;
}
}
class Semaphore {
private permits: number;
private queue: (() => void)[] = [];
constructor(permits: number) {
this.permits = permits;
}
async acquire(): Promise<void> {
if (this.permits > 0) {
this.permits--;
return;
}
return new Promise((resolve) => {
this.queue.push(resolve);
});
}
release(): void {
const next = this.queue.shift();
if (next) {
next();
} else {
this.permits++;
}
}
async runExclusive<T>(fn: () => Promise<T>): Promise<T> {
await this.acquire();
try {
const url = `${discoverSvcUrl.replace(/\/$/, '')}/api/v1/discover?topic=${topic}`;
const res = await fetch(url, { signal: AbortSignal.timeout(60000) });
if (!res.ok) {
console.warn(`[discover] ${topic}: HTTP ${res.status}`);
continue;
}
const data = await res.json();
const key = `discover:${topic}`;
await redis.setex(key, 30 * 60, JSON.stringify(data));
console.log(`[discover] ${topic}: cached ${data?.items?.length ?? 0} items`);
} catch (err) {
console.error(`[discover] ${topic}:`, err);
return await fn();
} finally {
this.release();
}
}
}
async function scrapeViaCrawl4ai(url: string): Promise<{ title: string; markdown: string }> {
const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
urls: [url],
crawler_config: {
type: 'CrawlerRunConfig',
params: { cache_mode: 'default', page_timeout: 15000 },
},
}),
signal: AbortSignal.timeout(CRAWL4AI_TIMEOUT_MS),
});
if (!res.ok) throw new Error(`Crawl4AI HTTP ${res.status}`);
const data = (await res.json()) as {
results?: { url: string; markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
};
const first = data.results?.[0];
const md = extractCrawl4aiMarkdown(first?.markdown);
if (!first?.success || !md) throw new Error('Crawl4AI empty result');
return { title: first.metadata?.title ?? '', markdown: md.slice(0, 10000) };
}
async function scrapeFallback(url: string): Promise<{ title: string; markdown: string }> {
const res = await fetch(url, {
signal: AbortSignal.timeout(10000),
headers: { 'User-Agent': 'GooSeek-CacheWorker/1.0' },
});
const html = await res.text();
const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? '';
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
const body = bodyMatch ? bodyMatch[1] : html;
const markdown = body
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.slice(0, 10000);
return { title, markdown };
}
async function scrapeArticle(blog: RawBlog): Promise<ScrapedArticle> {
let title = blog.title || blog.url;
let markdown = (blog.content ?? '').trim();
try {
const scraped = await scrapeViaCrawl4ai(blog.url);
if (scraped.markdown.length > 100) {
markdown = scraped.markdown;
if (scraped.title) title = scraped.title;
}
} catch {
try {
const fallback = await scrapeFallback(blog.url);
if (fallback.markdown.length > 100) {
markdown = fallback.markdown;
if (fallback.title) title = fallback.title;
}
} catch {
// keep original content
}
}
return { url: blog.url, title, markdown, thumbnail: blog.thumbnail ?? '' };
}
let totalInputChars = 0;
let totalOutputChars = 0;
let totalLlmCalls = 0;
function estimateTokens(chars: number): number {
return Math.ceil(chars / 4);
}
async function callLlm(
systemPrompt: string,
userPrompt: string,
timeoutMs: number = LLM_TIMEOUT_MS,
): Promise<string> {
if (!LLM_SVC_URL) throw new Error('LLM_SVC_URL not set');
const base = LLM_SVC_URL.replace(/\/$/, '');
const inputChars = systemPrompt.length + userPrompt.length;
const res = await fetch(`${base}/api/v1/generate`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
model: { providerId: LLM_DISCOVER_PROVIDER, key: LLM_DISCOVER_MODEL },
messages: [
{ role: 'system', content: systemPrompt },
{ role: 'user', content: userPrompt },
],
}),
signal: AbortSignal.timeout(timeoutMs),
});
if (!res.ok) throw new Error(`LLM HTTP ${res.status}`);
const data = (await res.json()) as { content?: string };
const output = (data.content ?? '').trim();
totalInputChars += inputChars;
totalOutputChars += output.length;
totalLlmCalls++;
return output;
}
function logTokenUsage(): void {
const inputTokens = estimateTokens(totalInputChars);
const outputTokens = estimateTokens(totalOutputChars);
const costInput = (inputTokens / 1_000_000) * 0.15;
const costOutput = (outputTokens / 1_000_000) * 0.60;
console.log(`[discover] Token usage: ~${inputTokens} input, ~${outputTokens} output (${totalLlmCalls} calls)`);
console.log(`[discover] Estimated cost (gpt-4o-mini): $${(costInput + costOutput).toFixed(4)}`);
}
function parseJsonFromLlm<T>(raw: string): T {
const cleaned = raw.replace(/^```json?\s*/i, '').replace(/\s*```$/i, '').trim();
return JSON.parse(cleaned) as T;
}
async function clusterArticles(articles: RawBlog[]): Promise<ClusterDef[]> {
const listing = articles
.map((a, i) => `[${i}] ${a.title} — ${a.url}`)
.join('\n');
const systemPrompt = 'Ты группируешь новости по темам. Возвращаешь ТОЛЬКО валидный JSON массив без markdown.';
const userPrompt = `Сгруппируй эти ${articles.length} новостей по темам/историям. Каждая группа — одна "история" или тема.
Объедини похожие/дублирующиеся новости в одну группу.
Верни JSON массив: [{"clusterTitle": "Краткий заголовок темы на русском", "articleIndices": [0,2,5], "mainIndex": 0}]
Максимум ${MAX_CLUSTERS} групп. mainIndex — индекс главной/лучшей статьи в группе.
Статьи:
${listing}`;
for (let attempt = 0; attempt < LLM_RETRY_ATTEMPTS; attempt++) {
try {
if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));
const raw = await callLlm(systemPrompt, userPrompt);
const clusters = parseJsonFromLlm<ClusterDef[]>(raw);
if (Array.isArray(clusters) && clusters.length > 0) {
return clusters.slice(0, MAX_CLUSTERS);
}
} catch (e) {
console.warn(`[discover] cluster attempt ${attempt + 1} failed:`, e);
}
}
return [{
clusterTitle: articles[0]?.title ?? 'Новости',
articleIndices: articles.map((_, i) => i),
mainIndex: 0,
}];
}
async function synthesizeDigest(
clusterTitle: string,
scrapedArticles: ScrapedArticle[],
): Promise<SynthesisResult | null> {
const sourcesListing = scrapedArticles
.map((a, i) => `[${i + 1}] ${a.title}\n${a.url}\n${a.markdown.slice(0, 1500)}`)
.join('\n---\n');
const systemPrompt = 'Синтезируй новости в сводку на русском. JSON без markdown.';
const userPrompt = `Тема: "${clusterTitle}"
${sourcesListing}
Синтезируй в сводку 3-5 абзацев. Ссылки [N] на источники. Русский язык.
JSON: {"summaryRu":"...[1][2]...","citations":[{"index":1,"url":"","title":"","domain":""}],"followUp":["?","?"],"shortDescription":"..."}`;
for (let attempt = 0; attempt < LLM_RETRY_ATTEMPTS; attempt++) {
try {
if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));
const raw = await callLlm(systemPrompt, userPrompt, 180_000);
const result = parseJsonFromLlm<SynthesisResult>(raw);
if (result.summaryRu && result.citations) {
if (!result.citations.length) {
result.citations = scrapedArticles.map((a, i) => ({
index: i + 1,
url: a.url,
title: a.title,
domain: extractDomain(a.url),
}));
}
if (!result.followUp?.length) {
result.followUp = [];
}
if (!result.shortDescription) {
result.shortDescription = result.summaryRu.slice(0, 200) + '…';
}
return result;
}
} catch (e) {
console.warn(`[discover] synthesis attempt ${attempt + 1} failed for "${clusterTitle}":`, e);
}
}
return null;
}
async function saveDigest(
discoverBase: string,
payload: {
topic: string;
region: string;
clusterTitle: string;
summaryRu: string;
citations: DigestCitation[];
sourcesCount: number;
followUp: string[];
thumbnail: string;
shortDescription: string;
mainUrl: string;
},
): Promise<boolean> {
try {
const res = await fetch(`${discoverBase}/api/v1/discover/digest`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
signal: AbortSignal.timeout(10000),
});
return res.status === 204;
} catch (e) {
console.error(`[discover] save digest failed:`, e);
return false;
}
}
interface ClusterProcessingData {
cluster: ClusterDef;
clusterBlogs: RawBlog[];
topic: string;
region: string;
}
async function processTopicRegion(
baseDiscover: string,
topic: string,
region: string,
synthesisSemaphore: Semaphore,
): Promise<{ saved: number; total: number }> {
console.log(`[discover] ${topic}:${region} — starting digest pipeline`);
const rawUrl = `${baseDiscover}/api/v1/discover?topic=${topic}&region=${region}&mode=raw`;
const rawRes = await fetch(rawUrl, { signal: AbortSignal.timeout(60000) });
if (!rawRes.ok) {
console.warn(`[discover] ${topic}:${region} raw HTTP ${rawRes.status}`);
return { saved: 0, total: 0 };
}
const rawData = (await rawRes.json()) as { blogs?: RawBlog[] };
const blogs = Array.isArray(rawData?.blogs) ? rawData.blogs : [];
if (blogs.length === 0) {
console.log(`[discover] ${topic}:${region} — no raw articles`);
return { saved: 0, total: 0 };
}
const articlesToProcess = blogs.slice(0, MAX_RAW_ARTICLES);
console.log(`[discover] ${topic}:${region} — ${articlesToProcess.length} raw articles`);
const clusters = await clusterArticles(articlesToProcess);
console.log(`[discover] ${topic}:${region} — ${clusters.length} clusters`);
const clusterData: ClusterProcessingData[] = clusters
.map((cluster) => {
const clusterBlogs = cluster.articleIndices
.filter((i) => i >= 0 && i < articlesToProcess.length)
.slice(0, MAX_URLS_PER_CLUSTER)
.map((i) => articlesToProcess[i]);
return { cluster, clusterBlogs, topic, region };
})
.filter((d) => d.clusterBlogs.length > 0);
const processCluster = async (data: ClusterProcessingData): Promise<boolean> => {
const { cluster, clusterBlogs, topic, region } = data;
const mainIdx = cluster.mainIndex >= 0 && cluster.mainIndex < clusterBlogs.length
? cluster.mainIndex
: 0;
const mainBlog = clusterBlogs[mainIdx];
try {
const checkRes = await fetch(
`${baseDiscover}/api/v1/discover/digest?url=${encodeURIComponent(mainBlog.url)}`,
{ signal: AbortSignal.timeout(5000) },
);
if (checkRes.ok) {
console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — digest exists, skipping`);
return true;
}
} catch {
// continue with synthesis
}
const scrapeResults = await Promise.allSettled(
clusterBlogs.map((blog) => scrapeArticle(blog)),
);
const scraped = scrapeResults
.filter((r): r is PromiseFulfilledResult<ScrapedArticle> => r.status === 'fulfilled')
.map((r) => r.value)
.filter((a) => a.markdown.length > 50);
if (scraped.length === 0) {
console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — no scraped content`);
return false;
}
console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — ${scraped.length} scraped`);
const synthesis = await synthesisSemaphore.runExclusive(() =>
synthesizeDigest(cluster.clusterTitle, scraped),
);
if (!synthesis) {
console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — synthesis failed`);
return false;
}
const mainScraped = scraped.find((s) => s.url === mainBlog.url);
const thumbnail = mainScraped?.thumbnail || mainBlog.thumbnail || '';
return saveDigest(baseDiscover, {
topic,
region,
clusterTitle: cluster.clusterTitle,
summaryRu: synthesis.summaryRu,
citations: synthesis.citations,
sourcesCount: scraped.length,
followUp: synthesis.followUp,
thumbnail,
shortDescription: synthesis.shortDescription,
mainUrl: mainBlog.url,
});
};
const results = await Promise.allSettled(
clusterData.map((data) => processCluster(data)),
);
const savedCount = results.filter(
(r) => r.status === 'fulfilled' && r.value === true,
).length;
console.log(`[discover] ${topic}:${region} — ${savedCount}/${clusters.length} digests saved`);
return { saved: savedCount, total: clusters.length };
}
export async function runDiscoverPrecompute(
_redis: RedisType,
discoverSvcUrl: string,
topicFilter?: DiscoverTopic,
): Promise<void> {
totalInputChars = 0;
totalOutputChars = 0;
totalLlmCalls = 0;
const baseDiscover = discoverSvcUrl.replace(/\/$/, '');
const topics = topicFilter && DISCOVER_TOPICS.includes(topicFilter)
? [topicFilter]
: [...DISCOVER_TOPICS];
const topicRegionPairs: { topic: string; region: string }[] = [];
for (const topic of topics) {
for (const region of REGIONS) {
topicRegionPairs.push({ topic, region });
}
}
const topicRegionSemaphore = new Semaphore(MAX_PARALLEL_TOPIC_REGIONS);
const synthesisSemaphore = new Semaphore(MAX_PARALLEL_SYNTHESIS);
const startTime = Date.now();
console.log(`[discover] Starting parallel processing of ${topicRegionPairs.length} topic/region pairs`);
const results = await Promise.allSettled(
topicRegionPairs.map(({ topic, region }) =>
topicRegionSemaphore.runExclusive(() =>
processTopicRegion(baseDiscover, topic, region, synthesisSemaphore),
),
),
);
const totalSaved = results
.filter((r): r is PromiseFulfilledResult<{ saved: number; total: number }> => r.status === 'fulfilled')
.reduce((sum, r) => sum + r.value.saved, 0);
const totalClusters = results
.filter((r): r is PromiseFulfilledResult<{ saved: number; total: number }> => r.status === 'fulfilled')
.reduce((sum, r) => sum + r.value.total, 0);
const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(`[discover] Completed: ${totalSaved}/${totalClusters} digests saved in ${elapsedSec}s`);
logTokenUsage();
}
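
For completeness, a standalone invocation sketch that mirrors what run.ts does above; the import path and env defaults are assumed for illustration and are not part of the diff:

import Redis from 'ioredis';
import { runDiscoverPrecompute } from './tasks/discover.js'; // path as used by run.ts

const redis = new Redis(process.env.REDIS_URL ?? 'redis://localhost:6379');
try {
  // Restrict the run to one topic while testing; omit the third argument to process all topics.
  await runDiscoverPrecompute(redis, process.env.DISCOVER_SVC_URL ?? 'http://localhost:3002', 'tech');
} finally {
  await redis.quit();
}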