feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes: - Add Go backend (backend/) with microservices architecture - Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection - New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions - Improved discover-svc with discover-db integration - Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md) - Library-svc: project_id schema migration - Remove deprecated finance-svc and travel-svc - Localization improvements across services Made-with: Cursor
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
FROM node:22-alpine AS builder
|
||||
WORKDIR /app
|
||||
COPY package*.json ./
|
||||
RUN npm ci
|
||||
COPY --from=npm-cache / /tmp/npm-cache
|
||||
RUN npm install --cache /tmp/npm-cache --prefer-offline --no-audit
|
||||
COPY tsconfig.json ./
|
||||
COPY src ./src
|
||||
RUN npm run build
|
||||
@@ -9,7 +11,8 @@ RUN npm run build
|
||||
FROM node:22-alpine
|
||||
WORKDIR /app
|
||||
COPY package*.json ./
|
||||
RUN npm ci --omit=dev
|
||||
COPY --from=npm-cache / /tmp/npm-cache
|
||||
RUN npm install --omit=dev --cache /tmp/npm-cache --prefer-offline --no-audit
|
||||
COPY --from=builder /app/dist ./dist
|
||||
EXPOSE 9090
|
||||
ENTRYPOINT ["node", "dist/run.js"]
|
||||
|
||||
@@ -11,7 +11,9 @@
|
||||
"build": "tsc"
|
||||
},
|
||||
"dependencies": {
|
||||
"ioredis": "^5.4.1"
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"ioredis": "^5.4.1",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.10.0",
|
||||
|
||||
154
services/cache-worker/src/lib/crawl4ai-client.ts
Normal file
154
services/cache-worker/src/lib/crawl4ai-client.ts
Normal file
@@ -0,0 +1,154 @@
|
||||
/**
|
||||
* Crawl4AI REST API client for cache-worker with stealth support
|
||||
*/
|
||||
|
||||
/**
 * Outcome of crawling one URL, produced either from a Crawl4AI result or
 * from the plain-fetch fallback in this module.
 */
export interface CrawlResult {
  /** URL that was requested. */
  url: string;
  /** Page title; producers in this module substitute the URL or an error label when none is found. */
  title: string;
  /** Extracted page text (Crawl4AI markdown, or tag-stripped HTML from the fallback). */
  content: string;
  /** Whether the crawl or fetch was considered successful. */
  success: boolean;
  /** HTTP status code, when one is available. */
  statusCode?: number;
}
|
||||
|
||||
/** Base URL of the Crawl4AI service with one trailing slash removed; an empty string disables Crawl4AI. */
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? '').replace(/\/$/, '');

/** Abort timeout for one Crawl4AI batch request, in milliseconds. */
const TIMEOUT_MS = 45 * 1_000;

/** Maximum number of URLs handled per crawlUrls() call; any extra URLs are dropped. */
const MAX_URLS_PER_BATCH = 15;
|
||||
|
||||
/** Desktop browser User-Agent strings that outgoing requests choose from. */
const USER_AGENTS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
];

/**
 * Picks one entry of USER_AGENTS at random.
 *
 * @returns A User-Agent header value.
 */
function randomUserAgent(): string {
  const pick = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[pick];
}
|
||||
|
||||
/**
 * Normalizes the `markdown` field of a Crawl4AI result to a plain string.
 *
 * A string is returned as-is. For an object, the first of `raw_markdown`,
 * `markdown_with_citations` that is a string wins (even if empty);
 * `fit_markdown` is used only when it is a non-empty string.
 *
 * @param md Value of the `markdown` field, of unknown shape.
 * @returns The markdown text, or '' when nothing usable is found.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (!md || typeof md !== 'object') return '';

  const record = md as Record<string, unknown>;
  for (const key of ['raw_markdown', 'markdown_with_citations'] as const) {
    const candidate = record[key];
    if (typeof candidate === 'string') return candidate;
  }

  const fit = record.fit_markdown;
  return typeof fit === 'string' && fit.length > 0 ? fit : '';
}
|
||||
|
||||
/**
 * Reduces an HTML fragment to plain text.
 *
 * Removes `<script>` and `<style>` elements with their contents, replaces
 * every remaining tag with a space, collapses whitespace runs, trims, and
 * caps the result at 50,000 characters.
 *
 * @param html HTML markup to strip.
 * @returns The extracted text.
 */
function stripHtmlToText(html: string): string {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, '');
  const tagsAsSpaces = withoutStyles.replace(/<[^>]+>/g, ' ');
  const collapsed = tagsAsSpaces.replace(/\s+/g, ' ').trim();
  return collapsed.slice(0, 50_000);
}
|
||||
|
||||
/**
 * Fetches a page directly (no Crawl4AI) and extracts its title and text.
 *
 * Never rejects: any error is converted into a CrawlResult with
 * `success: false` and the error message in `content`.
 *
 * @param url Page to download.
 * @returns The crawl outcome; `success` mirrors the HTTP `ok` flag.
 */
async function fetchFallback(url: string): Promise<CrawlResult> {
  try {
    const response = await fetch(url, {
      signal: AbortSignal.timeout(15000),
      headers: {
        'User-Agent': randomUserAgent(),
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
      },
    });
    const html = await response.text();

    // Title falls back to the URL when the page has no <title>.
    const titleMatch = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
    const title = titleMatch?.[1]?.trim() ?? url;

    // Prefer the <body> contents; use the whole document when there is no body tag.
    const bodyMatch = /<body[^>]*>([\s\S]*?)<\/body>/i.exec(html);
    const text = stripHtmlToText(bodyMatch ? bodyMatch[1] : html);

    return {
      url,
      title,
      content: text || title,
      success: response.ok,
      statusCode: response.status,
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      url,
      title: `Error: ${url}`,
      content: `Failed to fetch: ${reason}`,
      success: false,
    };
  }
}
|
||||
|
||||
/**
 * Crawls up to MAX_URLS_PER_BATCH URLs via Crawl4AI with stealth settings.
 *
 * Results come back in the order of the (truncated) input list. A URL whose
 * Crawl4AI result is missing, unsuccessful, or has 100 characters of
 * markdown or fewer is re-fetched with fetchFallback(). When CRAWL4AI_URL is
 * empty, or the Crawl4AI request fails as a whole, every URL goes through
 * the fallback.
 *
 * Per-URL fallbacks run concurrently, matching the two whole-batch fallback
 * paths.
 *
 * @param urls URLs to crawl; entries beyond MAX_URLS_PER_BATCH are ignored.
 * @returns One CrawlResult per processed URL.
 */
export async function crawlUrls(urls: string[]): Promise<CrawlResult[]> {
  const list = urls.slice(0, MAX_URLS_PER_BATCH);
  if (list.length === 0) return [];

  const fallbackAll = (): Promise<CrawlResult[]> =>
    Promise.all(list.map((url) => fetchFallback(url)));

  if (!CRAWL4AI_URL) {
    return fallbackAll();
  }

  type Crawl4aiRawResult = {
    url?: string;
    success?: boolean;
    markdown?: unknown;
    metadata?: { title?: string };
    status_code?: number;
  };

  try {
    const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        urls: list,
        crawler_config: {
          type: 'CrawlerRunConfig',
          params: {
            cache_mode: 'bypass',
            page_timeout: 30000,
            user_agent: randomUserAgent(),
            simulate_user: true,
            override_navigator: true,
          },
        },
      }),
      signal: AbortSignal.timeout(TIMEOUT_MS),
    });

    if (!res.ok) {
      throw new Error(`Crawl4AI HTTP ${res.status}`);
    }

    const data = (await res.json()) as { results?: Crawl4aiRawResult[] };

    // Index results by URL; the first result for a URL wins.
    const byUrl = new Map<string, Crawl4aiRawResult>();
    for (const r of data.results ?? []) {
      const u = r.url ?? '';
      if (u && !byUrl.has(u)) byUrl.set(u, r);
    }

    // Promise.all preserves input order while letting fallbacks overlap.
    return await Promise.all(
      list.map((url): CrawlResult | Promise<CrawlResult> => {
        const r = byUrl.get(url);
        const markdown = extractCrawl4aiMarkdown(r?.markdown);
        if (r?.success && markdown.length > 100) {
          return {
            url,
            title: r.metadata?.title ?? url,
            content: markdown,
            success: true,
            statusCode: r.status_code,
          };
        }
        return fetchFallback(url);
      }),
    );
  } catch (err) {
    // Best effort: Crawl4AI being down must not fail the batch, but leave a trace.
    console.warn('[crawl4ai-client] Crawl4AI request failed, using plain fetch fallback:', err);
    return fallbackAll();
  }
}
|
||||
@@ -1,12 +1,15 @@
|
||||
/**
|
||||
* cache-worker — pre-compute discover, finance, travel
|
||||
* cache-worker — pre-compute discover, finance, travel, digest-queue
|
||||
* docs/architecture: 03-cache-and-precompute-strategy.md
|
||||
* Cron: discover every 15m, finance every 2m, travel every 4h
|
||||
* Usage: node run.js --task=discover|finance|travel|all
|
||||
* Cron: discover every 15m, finance every 2m, travel every 4h, digest-queue continuous
|
||||
* Usage: node run.js --task=discover|finance|travel|digest-queue|all [--topic=tech|finance|art|sports|entertainment]
|
||||
*/
|
||||
|
||||
import Redis from 'ioredis';
|
||||
import { runDiscoverPrecompute } from './tasks/discover.js';
|
||||
import {
|
||||
runDiscoverPrecompute,
|
||||
type DiscoverTopic,
|
||||
} from './tasks/discover.js';
|
||||
import { runFinancePrecompute } from './tasks/finance.js';
|
||||
import { runTravelPrecompute } from './tasks/travel.js';
|
||||
|
||||
@@ -14,20 +17,165 @@ const REDIS_URL = process.env.REDIS_URL ?? 'redis://localhost:6379';
|
||||
/** Base URL of discover-svc (env DISCOVER_SVC_URL, local default). */
const DISCOVER_SVC_URL = process.env.DISCOVER_SVC_URL ?? 'http://localhost:3002';

/** Base URL of finance-svc (env FINANCE_SVC_URL, local default). */
const FINANCE_SVC_URL = process.env.FINANCE_SVC_URL ?? 'http://localhost:3003';

/** Base URL of travel-svc (env TRAVEL_SVC_URL, local default). */
const TRAVEL_SVC_URL = process.env.TRAVEL_SVC_URL ?? 'http://localhost:3004';

/** Base URL of master-agents-svc, used to generate article summaries for the digest queue. */
const MASTER_AGENTS_SVC_URL = process.env.MASTER_AGENTS_SVC_URL ?? 'http://localhost:3006';

// NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
const DIGEST_QUEUE_BATCH_SIZE = 5;

/** Pause before retrying after a failed digest-queue poll, in milliseconds. */
const DIGEST_QUEUE_POLL_INTERVAL_MS = 5_000;
|
||||
|
||||
/**
 * Reads the task name from the command line.
 *
 * Accepts both `--task <name>` and `--task=<name>`; the latter is the form
 * shown in this file's usage header. The space-separated form takes
 * precedence when both are present.
 *
 * @returns The task name, or 'all' when none was given.
 */
function getTask(): string {
  const args = process.argv;

  const flagIndex = args.indexOf('--task');
  const spaced = flagIndex >= 0 ? args[flagIndex + 1] : undefined;
  if (spaced) return spaced;

  const inlinePrefix = '--task=';
  const inline = args.find((arg) => arg.startsWith(inlinePrefix));
  const inlineValue = inline?.slice(inlinePrefix.length);
  return inlineValue ? inlineValue : 'all';
}
|
||||
|
||||
/**
 * Reads the optional discover topic filter from the command line.
 *
 * Accepts both `--topic <name>` and `--topic=<name>`; the latter is the form
 * shown in this file's usage header. The space-separated form is consulted
 * first.
 *
 * @returns The topic when it is one of the supported names, otherwise undefined.
 */
function getDiscoverTopic(): DiscoverTopic | undefined {
  const supported: readonly string[] = ['tech', 'finance', 'art', 'sports', 'entertainment'];
  const args = process.argv;

  const flagIndex = args.indexOf('--topic');
  let value = flagIndex >= 0 ? args[flagIndex + 1] : undefined;

  if (value === undefined) {
    const inlinePrefix = '--topic=';
    const inline = args.find((arg) => arg.startsWith(inlinePrefix));
    value = inline?.slice(inlinePrefix.length);
  }

  return value && supported.includes(value) ? (value as DiscoverTopic) : undefined;
}
|
||||
|
||||
/**
 * Generates a summary for one queued article and stores it in discover-svc.
 *
 * Posts a "Summary: <url>" request to master-agents-svc, collects every
 * non-blank line of the streamed response as an event, then posts the events
 * to discover-svc's article-summary endpoint.
 *
 * @param discoverBase Base URL of discover-svc without a trailing slash.
 * @param item Queue entry to process.
 * @returns true only when events were received AND discover-svc accepted
 *   them; false on any failure (errors are logged, never thrown).
 */
async function processDigestQueueItem(
  discoverBase: string,
  item: { url: string; title: string },
): Promise<boolean> {
  console.log(`[digest-queue] Processing: ${item.url.slice(0, 60)}`);

  try {
    const res = await fetch(`${MASTER_AGENTS_SVC_URL.replace(/\/$/, '')}/api/v1/agents/search`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        message: {
          messageId: crypto.randomUUID(),
          chatId: 'digest-queue',
          content: `Summary: ${item.url}`,
        },
        optimizationMode: 'balanced',
        sources: ['web'],
        history: [],
        files: [],
        chatModel: { providerId: 'env', key: 'default' },
        systemInstructions: '',
        locale: 'ru',
        answerMode: 'standard',
      }),
      signal: AbortSignal.timeout(180000),
      duplex: 'half',
    } as RequestInit);

    if (!res.ok) {
      console.warn(`[digest-queue] Failed to generate summary for ${item.url}: HTTP ${res.status}`);
      return false;
    }

    const reader = res.body?.getReader();
    if (!reader) return false;

    // Split the streamed body into lines, carrying the partial last line over between chunks.
    const events: string[] = [];
    const decoder = new TextDecoder();
    let buffer = '';

    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() ?? '';
      for (const line of lines) {
        if (line.trim()) events.push(line);
      }
    }
    // Flush any bytes the streaming decoder is still holding.
    buffer += decoder.decode();
    if (buffer.trim()) events.push(buffer);

    if (events.length === 0) return false;

    const saveRes = await fetch(`${discoverBase}/api/v1/discover/article-summary`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url: item.url, events }),
      signal: AbortSignal.timeout(30000),
    });

    // A rejected save must not be reported as success, or the caller drops the item.
    if (!saveRes.ok) {
      console.warn(`[digest-queue] Failed to save summary for ${item.url}: HTTP ${saveRes.status}`);
      return false;
    }

    console.log(`[digest-queue] Saved summary for ${item.url.slice(0, 60)}`);
    return true;
  } catch (err) {
    console.error(`[digest-queue] Error processing ${item.url}:`, err);
    return false;
  }
}
|
||||
|
||||
/**
 * Drains the discover-svc digest queue, one item per iteration.
 *
 * Each iteration fetches the next item, processes it, then sends a DELETE for
 * it with `requeue=true` when processing failed. The loop ends when the queue
 * is empty, after 100 iterations, or once more than 10 errors have
 * accumulated — counting both failed items and iterations that threw.
 *
 * @param discoverSvcUrl Base URL of discover-svc (a trailing slash is tolerated).
 */
async function runDigestQueueProcessor(discoverSvcUrl: string): Promise<void> {
  const baseDiscover = discoverSvcUrl.replace(/\/$/, '');
  const queueUrl = `${baseDiscover}/api/v1/discover/queue`;
  const maxIterations = 100;
  const maxErrors = 10;
  const pause = (): Promise<void> =>
    new Promise((resolve) => setTimeout(resolve, DIGEST_QUEUE_POLL_INTERVAL_MS));

  let processed = 0;
  let errors = 0;

  console.log('[digest-queue] Starting queue processor');

  for (let i = 0; i < maxIterations; i++) {
    try {
      const res = await fetch(queueUrl, {
        signal: AbortSignal.timeout(10000),
      });

      if (!res.ok) {
        console.warn(`[digest-queue] Queue fetch failed: HTTP ${res.status}`);
        await pause();
        continue;
      }

      const data = (await res.json()) as { item: { url: string; title: string } | null; queueLength: number };

      if (!data.item) {
        console.log(`[digest-queue] Queue empty, stopping`);
        break;
      }

      console.log(`[digest-queue] Queue length: ${data.queueLength}`);

      const success = await processDigestQueueItem(baseDiscover, data.item);

      // Acknowledge the item; a failure here is logged but must not abort the run.
      try {
        await fetch(`${queueUrl}?url=${encodeURIComponent(data.item.url)}&requeue=${!success}`, {
          method: 'DELETE',
          signal: AbortSignal.timeout(5000),
        });
      } catch (ackErr) {
        console.warn(`[digest-queue] Failed to acknowledge ${data.item.url}:`, ackErr);
      }

      if (success) {
        processed++;
      } else {
        errors++;
      }
    } catch (err) {
      console.error('[digest-queue] Iteration error:', err);
      errors++;
      // No point sleeping when the loop is about to stop.
      if (errors <= maxErrors) await pause();
    }

    // Checked outside try/catch so thrown iterations count toward the limit too.
    if (errors > maxErrors) {
      console.warn('[digest-queue] Too many errors, stopping');
      break;
    }
  }

  console.log(`[digest-queue] Done: ${processed} processed, ${errors} errors`);
}
|
||||
|
||||
async function main() {
|
||||
const task = getTask();
|
||||
const discoverTopic = getDiscoverTopic();
|
||||
// @ts-expect-error — ioredis + NodeNext ESM constructability
|
||||
const redis = new Redis(REDIS_URL);
|
||||
try {
|
||||
if (task === 'discover' || task === 'all') {
|
||||
console.log('[cache-worker] Running discover precompute...');
|
||||
await runDiscoverPrecompute(redis, DISCOVER_SVC_URL);
|
||||
const label = discoverTopic ? `[${discoverTopic}]` : '';
|
||||
console.log(`[cache-worker] Running discover precompute ${label}...`);
|
||||
await runDiscoverPrecompute(redis, DISCOVER_SVC_URL, discoverTopic);
|
||||
}
|
||||
if (task === 'finance' || task === 'all') {
|
||||
console.log('[cache-worker] Running finance precompute...');
|
||||
@@ -37,6 +185,10 @@ async function main() {
|
||||
console.log('[cache-worker] Running travel precompute...');
|
||||
await runTravelPrecompute(redis, TRAVEL_SVC_URL);
|
||||
}
|
||||
if (task === 'digest-queue') {
|
||||
console.log('[cache-worker] Running digest queue processor...');
|
||||
await runDigestQueueProcessor(DISCOVER_SVC_URL);
|
||||
}
|
||||
console.log('[cache-worker] Done');
|
||||
} finally {
|
||||
await redis.quit();
|
||||
|
||||
@@ -1,30 +1,519 @@
|
||||
/**
|
||||
* Discover pre-compute: вызывает discover-svc для всех тем, заполняет Redis
|
||||
* Redis: discover:{topic} TTL 30 min
|
||||
* Discover pre-compute: Perplexity-style multi-source digests.
|
||||
*
|
||||
* Pipeline per topic+region:
|
||||
* 1. Fetch raw articles from discover-svc (SearXNG)
|
||||
* 2. Cluster articles by topic via LLM
|
||||
* 3. Scrape all URLs in each cluster via Crawl4AI
|
||||
* 4. Synthesize multi-source summary with inline citations via LLM
|
||||
* 5. Save digest to discover-svc (SQLite + Redis invalidation)
|
||||
*
|
||||
* Fallback: if Crawl4AI is unavailable, falls back to basic fetch+strip.
|
||||
* Fallback: if clustering LLM fails, treat all articles as one cluster.
|
||||
*/
|
||||
|
||||
import type { Redis as RedisType } from 'ioredis';
|
||||
|
||||
const TOPICS = ['tech', 'finance', 'travel', 'world', 'science'];
|
||||
/** Topics for which digests are pre-computed. */
export const DISCOVER_TOPICS = ['tech', 'finance', 'art', 'sports', 'entertainment'] as const;

/** Union of the topic names in DISCOVER_TOPICS. */
export type DiscoverTopic = (typeof DISCOVER_TOPICS)[number];

/** Regions processed for every topic. */
const REGIONS = ['america', 'eu', 'world'] as const;

/** Raw articles taken per topic/region before clustering. */
const MAX_RAW_ARTICLES = 20;

/** Upper bound on clusters requested from, and accepted from, the LLM. */
const MAX_CLUSTERS = 5;

/** Articles scraped per cluster. */
const MAX_URLS_PER_CLUSTER = 5;

/** Abort timeout for one Crawl4AI scrape request, in milliseconds. */
const CRAWL4AI_TIMEOUT_MS = 25_000;

/** Default abort timeout for one LLM call, in milliseconds. */
const LLM_TIMEOUT_MS = 120_000;

/** Attempts made for each LLM step before giving up. */
const LLM_RETRY_ATTEMPTS = 3;

/** Delay between LLM attempts, in milliseconds. */
const LLM_RETRY_DELAY_MS = 5_000;

/** Concurrent synthesis calls allowed across the whole run. */
const MAX_PARALLEL_SYNTHESIS = 3;

/** Topic/region pipelines allowed to run at the same time. */
const MAX_PARALLEL_TOPIC_REGIONS = 2;
|
||||
|
||||
export async function runDiscoverPrecompute(
|
||||
redis: RedisType,
|
||||
discoverSvcUrl: string
|
||||
): Promise<void> {
|
||||
for (const topic of TOPICS) {
|
||||
/** Base URL of the LLM service; empty when unset, in which case callLlm() throws. */
const LLM_SVC_URL = process.env.LLM_SVC_URL?.trim() ?? '';

/** Provider id sent with LLM requests; falls back to 'env' when unset or blank. */
const LLM_DISCOVER_PROVIDER = process.env.LLM_DISCOVER_PROVIDER?.trim() || 'env';

/** Model key sent with LLM requests; falls back to 'gpt-4o-mini' when unset or blank. */
const LLM_DISCOVER_MODEL = process.env.LLM_DISCOVER_MODEL?.trim() || 'gpt-4o-mini';

/** Base URL of the Crawl4AI service with one trailing slash removed. */
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
|
||||
|
||||
/** One raw article as returned in the `blogs` array of discover-svc's raw mode. */
interface RawBlog {
  /** Article headline. */
  title: string;
  /** Snippet or body text, when the source provides one. */
  content?: string;
  /** Article URL. */
  url: string;
  /** Thumbnail image URL, when available. */
  thumbnail?: string;
}
|
||||
|
||||
/** One story cluster as described by the clustering LLM's JSON output. */
interface ClusterDef {
  /** Short headline for the cluster. */
  clusterTitle: string;
  /** Indices into the article list that was sent to the LLM. */
  articleIndices: number[];
  /** Index of the main article of the cluster. */
  mainIndex: number;
}
|
||||
|
||||
/** A numbered source reference attached to a digest summary. */
interface DigestCitation {
  /** Citation number; the locally generated defaults start at 1. */
  index: number;
  /** Source URL. */
  url: string;
  /** Source title. */
  title: string;
  /** Source host name. */
  domain: string;
}
|
||||
|
||||
/** Structured output of the synthesis LLM step for one cluster. */
interface SynthesisResult {
  /** Multi-paragraph summary in Russian with inline [N] citation markers. */
  summaryRu: string;
  /** Sources referenced by the summary. */
  citations: DigestCitation[];
  /** Suggested follow-up questions; may be empty. */
  followUp: string[];
  /** Short teaser text for the digest. */
  shortDescription: string;
}
|
||||
|
||||
/** An article after scraping, ready to be fed into synthesis. */
interface ScrapedArticle {
  /** Article URL. */
  url: string;
  /** Best available title (scraped, else the raw article's). */
  title: string;
  /** Article text: scraped content when long enough, else the raw snippet. */
  markdown: string;
  /** Thumbnail URL, or '' when the raw article had none. */
  thumbnail: string;
}
|
||||
|
||||
/**
 * Turns the `markdown` field of a Crawl4AI result into a plain string.
 *
 * Strings pass through unchanged. For objects, `raw_markdown` and then
 * `markdown_with_citations` are returned if they are strings (even empty
 * ones); `fit_markdown` is returned only when it is a non-empty string.
 *
 * @param md The `markdown` value, of unknown shape.
 * @returns Markdown text, or '' when none is present.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (!md || typeof md !== 'object') return '';

  const fields = md as Record<string, unknown>;

  const raw = fields.raw_markdown;
  if (typeof raw === 'string') return raw;

  const cited = fields.markdown_with_citations;
  if (typeof cited === 'string') return cited;

  const fit = fields.fit_markdown;
  return typeof fit === 'string' && fit.length > 0 ? fit : '';
}
|
||||
|
||||
/**
 * Extracts the host name from a URL, without a leading "www.".
 *
 * @param url Absolute URL.
 * @returns The host name, or the input unchanged when it is not a parseable URL.
 */
function extractDomain(url: string): string {
  let host: string;
  try {
    host = new URL(url).hostname;
  } catch {
    return url;
  }
  return host.startsWith('www.') ? host.slice('www.'.length) : host;
}
|
||||
|
||||
/**
 * Counting semaphore for limiting how many async tasks run at once.
 *
 * Waiters are served in FIFO order. On release, a freed permit is handed
 * straight to the oldest waiter instead of being returned to the pool.
 */
class Semaphore {
  /** Permits currently available. */
  private permits: number;
  /** Resolvers of callers waiting for a permit, oldest first. */
  private queue: (() => void)[] = [];

  /**
   * @param permits Number of tasks allowed to hold the semaphore simultaneously.
   */
  constructor(permits: number) {
    this.permits = permits;
  }

  /** Resolves once a permit has been obtained. */
  async acquire(): Promise<void> {
    if (this.permits > 0) {
      this.permits--;
      return;
    }
    return new Promise((resolve) => {
      this.queue.push(resolve);
    });
  }

  /** Gives a permit back, waking the oldest waiter if there is one. */
  release(): void {
    const next = this.queue.shift();
    if (next) {
      next();
    } else {
      this.permits++;
    }
  }

  /**
   * Runs `fn` while holding a permit; the permit is released even if `fn` rejects.
   *
   * @param fn Task to run.
   * @returns Whatever `fn` resolves to.
   */
  async runExclusive<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }
}
|
||||
|
||||
/**
 * Scrapes a single URL through the Crawl4AI REST API.
 *
 * @param url Page to scrape.
 * @returns The page title ('' when absent) and markdown capped at 10,000 characters.
 * @throws Error when Crawl4AI answers with a non-2xx status, or when the
 *   first result is missing, unsuccessful, or has no markdown.
 */
async function scrapeViaCrawl4ai(url: string): Promise<{ title: string; markdown: string }> {
  const requestBody = JSON.stringify({
    urls: [url],
    crawler_config: {
      type: 'CrawlerRunConfig',
      params: { cache_mode: 'default', page_timeout: 15000 },
    },
  });

  const response = await fetch(`${CRAWL4AI_URL}/crawl`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
    signal: AbortSignal.timeout(CRAWL4AI_TIMEOUT_MS),
  });
  if (!response.ok) throw new Error(`Crawl4AI HTTP ${response.status}`);

  const payload = (await response.json()) as {
    results?: { url: string; markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
  };

  // Only one URL was requested, so only the first result matters.
  const first = payload.results?.[0];
  const markdown = extractCrawl4aiMarkdown(first?.markdown);
  if (!first?.success || !markdown) throw new Error('Crawl4AI empty result');

  return {
    title: first.metadata?.title ?? '',
    markdown: markdown.slice(0, 10000),
  };
}
|
||||
|
||||
/**
 * Scrapes a page with a plain HTTP GET and crude HTML-to-text stripping.
 *
 * Used when Crawl4AI is unavailable. Non-2xx responses are rejected so that
 * an error page is never mistaken for article text; scrapeViaCrawl4ai treats
 * non-2xx statuses the same way.
 *
 * @param url Page to download.
 * @returns The page title ('' when absent) and text capped at 10,000 characters.
 * @throws Error on network failure, timeout, or a non-2xx HTTP status.
 */
async function scrapeFallback(url: string): Promise<{ title: string; markdown: string }> {
  const res = await fetch(url, {
    signal: AbortSignal.timeout(10000),
    headers: { 'User-Agent': 'GooSeek-CacheWorker/1.0' },
  });
  if (!res.ok) throw new Error(`Fallback fetch HTTP ${res.status}`);

  const html = await res.text();
  const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? '';

  // Prefer the <body> contents; use the whole document when there is no body tag.
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  const body = bodyMatch ? bodyMatch[1] : html;

  const markdown = body
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, 10000);

  return { title, markdown };
}
|
||||
|
||||
/**
 * Produces the best available text for one raw article.
 *
 * Starts from the raw article's own title and content. Crawl4AI is tried
 * first; only if that call throws is the plain-fetch fallback tried. Scraped
 * text replaces the starting content only when it is longer than 100
 * characters, and a scraped title replaces the raw one only when non-empty.
 * Never rejects: if both scrapers throw, the raw values are kept.
 *
 * @param blog Raw article from discover-svc.
 * @returns The article with its final title, text and thumbnail.
 */
async function scrapeArticle(blog: RawBlog): Promise<ScrapedArticle> {
  const article: ScrapedArticle = {
    url: blog.url,
    title: blog.title || blog.url,
    markdown: (blog.content ?? '').trim(),
    thumbnail: blog.thumbnail ?? '',
  };

  // Adopt scraped text only when it is substantial enough to beat the snippet.
  const adopt = (scraped: { title: string; markdown: string }): void => {
    if (scraped.markdown.length <= 100) return;
    article.markdown = scraped.markdown;
    if (scraped.title) article.title = scraped.title;
  };

  try {
    adopt(await scrapeViaCrawl4ai(blog.url));
  } catch {
    try {
      adopt(await scrapeFallback(blog.url));
    } catch {
      // both scrapers failed — keep the raw article's own content
    }
  }

  return article;
}
|
||||
|
||||
// Running totals for the current precompute run; updated after each successful LLM call.
let totalInputChars = 0;
let totalOutputChars = 0;
let totalLlmCalls = 0;

/**
 * Rough token estimate using a fixed ratio of four characters per token.
 *
 * @param chars Character count.
 * @returns Estimated token count, rounded up.
 */
function estimateTokens(chars: number): number {
  const charsPerToken = 4;
  return Math.ceil(chars / charsPerToken);
}
|
||||
|
||||
/**
 * Sends one system + user prompt pair to the LLM service and returns the reply text.
 *
 * On success the module-level usage counters are increased by the prompt
 * and reply sizes; failed calls leave them untouched.
 *
 * @param systemPrompt System message content.
 * @param userPrompt User message content.
 * @param timeoutMs Abort timeout in milliseconds (defaults to LLM_TIMEOUT_MS).
 * @returns The trimmed `content` field of the response, or '' when absent.
 * @throws Error when LLM_SVC_URL is not set or the service answers with a non-2xx status.
 */
async function callLlm(
  systemPrompt: string,
  userPrompt: string,
  timeoutMs: number = LLM_TIMEOUT_MS,
): Promise<string> {
  if (!LLM_SVC_URL) throw new Error('LLM_SVC_URL not set');

  const endpoint = `${LLM_SVC_URL.replace(/\/$/, '')}/api/v1/generate`;
  const payload = {
    model: { providerId: LLM_DISCOVER_PROVIDER, key: LLM_DISCOVER_MODEL },
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
  };

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(timeoutMs),
  });
  if (!response.ok) throw new Error(`LLM HTTP ${response.status}`);

  const { content } = (await response.json()) as { content?: string };
  const output = (content ?? '').trim();

  totalInputChars += systemPrompt.length + userPrompt.length;
  totalOutputChars += output.length;
  totalLlmCalls += 1;

  return output;
}
|
||||
|
||||
/**
 * Logs estimated token usage and cost for the LLM calls counted so far.
 *
 * Prices are hard-coded at $0.15 / $0.60 per million input / output tokens,
 * and the log line is labelled "gpt-4o-mini" regardless of the configured model.
 */
function logTokenUsage(): void {
  const inputUsdPerMillion = 0.15;
  const outputUsdPerMillion = 0.60;

  const inputTokens = estimateTokens(totalInputChars);
  const outputTokens = estimateTokens(totalOutputChars);
  const estimatedCost =
    (inputTokens / 1_000_000) * inputUsdPerMillion + (outputTokens / 1_000_000) * outputUsdPerMillion;

  console.log(`[discover] Token usage: ~${inputTokens} input, ~${outputTokens} output (${totalLlmCalls} calls)`);
  console.log(`[discover] Estimated cost (gpt-4o-mini): $${estimatedCost.toFixed(4)}`);
}
|
||||
|
||||
/**
 * Parses JSON from an LLM reply, tolerating a surrounding Markdown code fence.
 *
 * Handles both a bare ``` fence and a ```json fence (case-insensitive), with
 * optional whitespace around the fence. The result is NOT validated against
 * `T`; callers must check the shape themselves.
 *
 * @param raw Raw LLM reply text.
 * @returns The parsed value, typed as `T` by assertion only.
 * @throws SyntaxError when the cleaned text is not valid JSON.
 */
function parseJsonFromLlm<T>(raw: string): T {
  const cleaned = raw
    .trim()
    // `(?:json)?` makes the whole language tag optional, not just its last letter.
    .replace(/^```(?:json)?\s*/i, '')
    .replace(/\s*```$/i, '')
    .trim();
  return JSON.parse(cleaned) as T;
}
|
||||
|
||||
/**
 * Converts one entry of the LLM's cluster array into a ClusterDef.
 * Returns [] when the entry cannot be used (for use with flatMap).
 */
function sanitizeCluster(entry: unknown, articles: RawBlog[]): ClusterDef[] {
  if (!entry || typeof entry !== 'object') return [];
  const rec = entry as Record<string, unknown>;
  if (!Array.isArray(rec.articleIndices)) return [];

  // Keep integer indices only; range checking is left to the consumer.
  const articleIndices = rec.articleIndices.filter(
    (i): i is number => typeof i === 'number' && Number.isInteger(i),
  );
  if (articleIndices.length === 0) return [];

  const clusterTitle =
    typeof rec.clusterTitle === 'string' && rec.clusterTitle.trim()
      ? rec.clusterTitle
      : articles[articleIndices[0]]?.title ?? 'Новости';
  const mainIndex =
    typeof rec.mainIndex === 'number' && Number.isInteger(rec.mainIndex) ? rec.mainIndex : 0;

  return [{ clusterTitle, articleIndices, mainIndex }];
}

/**
 * Groups raw articles into story clusters using the LLM.
 *
 * Makes up to LLM_RETRY_ATTEMPTS calls, waiting LLM_RETRY_DELAY_MS between
 * them. The reply is validated entry by entry; malformed entries are dropped,
 * and a reply with no usable entry counts as a failed attempt. If every
 * attempt fails, all articles are returned as one cluster.
 *
 * @param articles Articles to cluster; their array indices are what the LLM refers to.
 * @returns Between one and MAX_CLUSTERS clusters.
 */
async function clusterArticles(articles: RawBlog[]): Promise<ClusterDef[]> {
  const listing = articles
    .map((a, i) => `[${i}] ${a.title} — ${a.url}`)
    .join('\n');

  const systemPrompt = 'Ты группируешь новости по темам. Возвращаешь ТОЛЬКО валидный JSON массив без markdown.';
  const userPrompt = `Сгруппируй эти ${articles.length} новостей по темам/историям. Каждая группа — одна "история" или тема.
Объедини похожие/дублирующиеся новости в одну группу.
Верни JSON массив: [{"clusterTitle": "Краткий заголовок темы на русском", "articleIndices": [0,2,5], "mainIndex": 0}]
Максимум ${MAX_CLUSTERS} групп. mainIndex — индекс главной/лучшей статьи в группе.

Статьи:
${listing}`;

  for (let attempt = 0; attempt < LLM_RETRY_ATTEMPTS; attempt++) {
    try {
      if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));
      const raw = await callLlm(systemPrompt, userPrompt);
      const parsed = parseJsonFromLlm<unknown>(raw);
      const clusters = Array.isArray(parsed)
        ? parsed.flatMap((entry) => sanitizeCluster(entry, articles))
        : [];
      if (clusters.length > 0) {
        return clusters.slice(0, MAX_CLUSTERS);
      }
    } catch (e) {
      console.warn(`[discover] cluster attempt ${attempt + 1} failed:`, e);
    }
  }

  // Fallback: treat everything as a single story.
  return [{
    clusterTitle: articles[0]?.title ?? 'Новости',
    articleIndices: articles.map((_, i) => i),
    mainIndex: 0,
  }];
}
|
||||
|
||||
/**
 * Asks the LLM to synthesize one cluster's articles into a cited summary.
 *
 * Each source is sent as its title, URL and the first 1,500 characters of
 * its text. Up to LLM_RETRY_ATTEMPTS calls are made, each with a 180 s
 * timeout. A reply is accepted when it has a truthy `summaryRu` and
 * `citations`; missing optional parts are then filled in: empty citations
 * are rebuilt from the sources, `followUp` defaults to [], and
 * `shortDescription` defaults to the first 200 characters of the summary
 * plus an ellipsis.
 *
 * @param clusterTitle Cluster headline used as the topic in the prompt.
 * @param scrapedArticles Sources to summarize; citation numbers are 1-based positions in this list.
 * @returns The synthesis result, or null when every attempt failed.
 */
async function synthesizeDigest(
  clusterTitle: string,
  scrapedArticles: ScrapedArticle[],
): Promise<SynthesisResult | null> {
  const sourcesListing = scrapedArticles
    .map((a, i) => `[${i + 1}] ${a.title}\n${a.url}\n${a.markdown.slice(0, 1500)}`)
    .join('\n---\n');

  const systemPrompt = 'Синтезируй новости в сводку на русском. JSON без markdown.';

  const userPrompt = `Тема: "${clusterTitle}"

${sourcesListing}

Синтезируй в сводку 3-5 абзацев. Ссылки [N] на источники. Русский язык.
JSON: {"summaryRu":"...[1][2]...","citations":[{"index":1,"url":"","title":"","domain":""}],"followUp":["?","?"],"shortDescription":"..."}`;

  let attempt = 0;
  while (attempt < LLM_RETRY_ATTEMPTS) {
    try {
      if (attempt > 0) await new Promise((r) => setTimeout(r, LLM_RETRY_DELAY_MS));

      const reply = await callLlm(systemPrompt, userPrompt, 180_000);
      const digest = parseJsonFromLlm<SynthesisResult>(reply);

      if (digest.summaryRu && digest.citations) {
        // The model returned no citations: cite every source in order.
        if (!digest.citations.length) {
          digest.citations = scrapedArticles.map((a, i) => ({
            index: i + 1,
            url: a.url,
            title: a.title,
            domain: extractDomain(a.url),
          }));
        }
        if (!digest.followUp?.length) {
          digest.followUp = [];
        }
        if (!digest.shortDescription) {
          digest.shortDescription = digest.summaryRu.slice(0, 200) + '…';
        }
        return digest;
      }
    } catch (e) {
      console.warn(`[discover] synthesis attempt ${attempt + 1} failed for "${clusterTitle}":`, e);
    }
    attempt += 1;
  }

  return null;
}
|
||||
|
||||
/**
 * Posts one finished digest to discover-svc.
 *
 * @param discoverBase Base URL of discover-svc without a trailing slash.
 * @param payload Digest fields, sent as the JSON request body.
 * @returns true only when the service answers 204; false for any other
 *   status or when the request throws (the error is logged).
 */
async function saveDigest(
  discoverBase: string,
  payload: {
    topic: string;
    region: string;
    clusterTitle: string;
    summaryRu: string;
    citations: DigestCitation[];
    sourcesCount: number;
    followUp: string[];
    thumbnail: string;
    shortDescription: string;
    mainUrl: string;
  },
): Promise<boolean> {
  const endpoint = `${discoverBase}/api/v1/discover/digest`;
  try {
    const { status } = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(10000),
    });
    return status === 204;
  } catch (e) {
    console.error(`[discover] save digest failed:`, e);
    return false;
  }
}
|
||||
|
||||
/** Everything needed to scrape, synthesize and save one cluster. */
interface ClusterProcessingData {
  /** Cluster definition produced by the clustering step. */
  cluster: ClusterDef;
  /** Raw articles selected for this cluster (already range-checked and capped). */
  clusterBlogs: RawBlog[];
  /** Topic the cluster belongs to. */
  topic: string;
  /** Region the cluster belongs to. */
  region: string;
}
|
||||
|
||||
/**
 * Runs the digest pipeline for one topic/region pair.
 *
 * Steps: fetch raw articles from discover-svc, cluster them, then for each
 * cluster (concurrently): skip it when a digest for its main URL already
 * exists, scrape its articles, synthesize a summary under the shared
 * synthesis semaphore, and save the digest.
 *
 * @param baseDiscover Base URL of discover-svc without a trailing slash.
 * @param topic Topic to process.
 * @param region Region to process.
 * @param synthesisSemaphore Limits concurrent synthesis calls across all pairs.
 * @returns `saved` = clusters saved or already present; `total` = clusters
 *   returned by clustering. Both are 0 when no raw articles could be fetched.
 */
async function processTopicRegion(
  baseDiscover: string,
  topic: string,
  region: string,
  synthesisSemaphore: Semaphore,
): Promise<{ saved: number; total: number }> {
  console.log(`[discover] ${topic}:${region} — starting digest pipeline`);

  // Build the query with URLSearchParams so every parameter is separated and escaped correctly.
  const query = new URLSearchParams({ topic, region, mode: 'raw' });
  const rawUrl = `${baseDiscover}/api/v1/discover?${query.toString()}`;
  const rawRes = await fetch(rawUrl, { signal: AbortSignal.timeout(60000) });
  if (!rawRes.ok) {
    console.warn(`[discover] ${topic}:${region} raw HTTP ${rawRes.status}`);
    return { saved: 0, total: 0 };
  }
  const rawData = (await rawRes.json()) as { blogs?: RawBlog[] };
  const blogs = Array.isArray(rawData?.blogs) ? rawData.blogs : [];
  if (blogs.length === 0) {
    console.log(`[discover] ${topic}:${region} — no raw articles`);
    return { saved: 0, total: 0 };
  }

  const articlesToProcess = blogs.slice(0, MAX_RAW_ARTICLES);
  console.log(`[discover] ${topic}:${region} — ${articlesToProcess.length} raw articles`);

  const clusters = await clusterArticles(articlesToProcess);
  console.log(`[discover] ${topic}:${region} — ${clusters.length} clusters`);

  // Resolve each cluster's indices to articles, dropping invalid indices and empty clusters.
  const clusterData: ClusterProcessingData[] = clusters
    .map((cluster) => {
      const indices = Array.isArray(cluster.articleIndices) ? cluster.articleIndices : [];
      const clusterBlogs = indices
        .filter((i) => Number.isInteger(i) && i >= 0 && i < articlesToProcess.length)
        .slice(0, MAX_URLS_PER_CLUSTER)
        .map((i) => articlesToProcess[i]);
      return { cluster, clusterBlogs, topic, region };
    })
    .filter((d) => d.clusterBlogs.length > 0);

  const processCluster = async (data: ClusterProcessingData): Promise<boolean> => {
    const { cluster, clusterBlogs } = data;

    // NOTE(review): mainIndex is used as a position within clusterBlogs, while the
    // clustering prompt lists articles by their global index — confirm which is intended.
    const mainIdx = cluster.mainIndex >= 0 && cluster.mainIndex < clusterBlogs.length
      ? cluster.mainIndex
      : 0;
    const mainBlog = clusterBlogs[mainIdx];

    // Skip clusters whose main article already has a stored digest.
    try {
      const checkRes = await fetch(
        `${baseDiscover}/api/v1/discover/digest?url=${encodeURIComponent(mainBlog.url)}`,
        { signal: AbortSignal.timeout(5000) },
      );
      if (checkRes.ok) {
        console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — digest exists, skipping`);
        return true;
      }
    } catch {
      // the existence check is best effort — continue with synthesis
    }

    const scrapeResults = await Promise.allSettled(
      clusterBlogs.map((blog) => scrapeArticle(blog)),
    );
    const scraped = scrapeResults
      .filter((r): r is PromiseFulfilledResult<ScrapedArticle> => r.status === 'fulfilled')
      .map((r) => r.value)
      .filter((a) => a.markdown.length > 50);

    if (scraped.length === 0) {
      console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — no scraped content`);
      return false;
    }

    console.log(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — ${scraped.length} scraped`);

    const synthesis = await synthesisSemaphore.runExclusive(() =>
      synthesizeDigest(cluster.clusterTitle, scraped),
    );

    if (!synthesis) {
      console.warn(`[discover] ${topic}:${region} cluster "${cluster.clusterTitle}" — synthesis failed`);
      return false;
    }

    const mainScraped = scraped.find((s) => s.url === mainBlog.url);
    const thumbnail = mainScraped?.thumbnail || mainBlog.thumbnail || '';

    return saveDigest(baseDiscover, {
      topic,
      region,
      clusterTitle: cluster.clusterTitle,
      summaryRu: synthesis.summaryRu,
      citations: synthesis.citations,
      sourcesCount: scraped.length,
      followUp: synthesis.followUp,
      thumbnail,
      shortDescription: synthesis.shortDescription,
      mainUrl: mainBlog.url,
    });
  };

  const results = await Promise.allSettled(
    clusterData.map((data) => processCluster(data)),
  );

  const savedCount = results.filter(
    (r) => r.status === 'fulfilled' && r.value === true,
  ).length;

  console.log(`[discover] ${topic}:${region} — ${savedCount}/${clusters.length} digests saved`);
  return { saved: savedCount, total: clusters.length };
}
|
||||
|
||||
/**
 * Pre-computes discover digests for every topic/region combination.
 *
 * Resets the token usage counters, runs processTopicRegion() for each pair
 * with at most MAX_PARALLEL_TOPIC_REGIONS pairs in flight and at most
 * MAX_PARALLEL_SYNTHESIS synthesis calls overall, then logs totals, elapsed
 * time and estimated token usage. A pair whose pipeline rejects is simply
 * left out of the totals.
 *
 * @param _redis Unused by this implementation.
 * @param discoverSvcUrl Base URL of discover-svc (a trailing slash is tolerated).
 * @param topicFilter When given and valid, only this topic is processed.
 */
export async function runDiscoverPrecompute(
  _redis: RedisType,
  discoverSvcUrl: string,
  topicFilter?: DiscoverTopic,
): Promise<void> {
  totalInputChars = 0;
  totalOutputChars = 0;
  totalLlmCalls = 0;

  const baseDiscover = discoverSvcUrl.replace(/\/$/, '');

  const useFilter = topicFilter && DISCOVER_TOPICS.includes(topicFilter);
  const topics = useFilter ? [topicFilter] : [...DISCOVER_TOPICS];

  // Cartesian product of topics and regions, topic-major.
  const topicRegionPairs: { topic: string; region: string }[] = topics.flatMap((topic) =>
    REGIONS.map((region) => ({ topic, region })),
  );

  const topicRegionSemaphore = new Semaphore(MAX_PARALLEL_TOPIC_REGIONS);
  const synthesisSemaphore = new Semaphore(MAX_PARALLEL_SYNTHESIS);

  const startTime = Date.now();
  console.log(`[discover] Starting parallel processing of ${topicRegionPairs.length} topic/region pairs`);

  const runPair = ({ topic, region }: { topic: string; region: string }) =>
    topicRegionSemaphore.runExclusive(() =>
      processTopicRegion(baseDiscover, topic, region, synthesisSemaphore),
    );
  const results = await Promise.allSettled(topicRegionPairs.map(runPair));

  let totalSaved = 0;
  let totalClusters = 0;
  for (const outcome of results) {
    if (outcome.status !== 'fulfilled') continue;
    totalSaved += outcome.value.saved;
    totalClusters += outcome.value.total;
  }

  const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`[discover] Completed: ${totalSaved}/${totalClusters} digests saved in ${elapsedSec}s`);
  logTokenUsage();
}
|
||||
|
||||
Reference in New Issue
Block a user