feat: Go backend, enhanced search, new widgets, Docker deploy

Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler,
  proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard,
  UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
This commit is contained in:
home
2026-02-27 04:15:32 +03:00
parent 328d968f3f
commit 06fe57c765
285 changed files with 53132 additions and 1871 deletions

View File

@@ -1,12 +1,136 @@
import type { LlmClient } from '../llm-client.js';
import SessionManager from '../session.js';
import type { TextBlock } from '../types.js';
import type { TextBlock, Chunk } from '../types.js';
import type { ClassifierOutput } from '../actions/types.js';
import { detectLanguage } from '../prompts/detectLanguage.js';
import { getClassifierPrompt } from '../prompts/classifier.js';
import { getWriterPrompt } from '../prompts/writer.js';
import { classify } from './classifier.js';
import { fastClassify, generateSearchQueries } from './fastClassifier.js';
import { research } from './researcher.js';
import { executeAllWidgets } from '../widgets/index.js';
import { searchMedia, type MediaSearchResult } from '../media-search.js';
import { searchSearxng } from '../searxng.js';
import { rerankBM25, computeAdaptiveTopK, estimateQueryComplexity } from '../reranker.js';
const DISCOVER_SVC_URL = (process.env.DISCOVER_SVC_URL ?? '').replace(/\/$/, '');
/** Pre-generated digest payload returned by discover-svc for a known article URL. */
interface DigestResponse {
// Russian-language summary of the article (primary display text).
summaryRu: string;
// Numbered source citations backing the summary; presumably `index` matches citation markers inside summaryRu — TODO confirm against discover-svc.
citations: { index: number; url: string; title: string; domain: string }[];
// Suggested follow-up questions to show the user.
followUp: string[];
// Number of sources the digest was built from.
sourcesCount: number;
// Title of the topic cluster the article belongs to.
clusterTitle: string;
}
/**
 * Ask discover-svc for a pre-generated digest of the given article URL.
 *
 * Returns null when the service is not configured, unreachable, slow (>3s),
 * responds with a non-2xx status, or the payload lacks a summary or citations.
 * Never throws: every failure degrades to "no pre-generated digest available".
 */
async function fetchPreGeneratedDigest(articleUrl: string): Promise<DigestResponse | null> {
if (!DISCOVER_SVC_URL) return null;
const endpoint = `${DISCOVER_SVC_URL}/api/v1/discover/digest?url=${encodeURIComponent(articleUrl)}`;
try {
const res = await fetch(endpoint, { signal: AbortSignal.timeout(3000) });
if (!res.ok) return null;
const digest = (await res.json()) as DigestResponse;
// Only accept a digest that actually carries a summary and at least one citation.
const usable = Boolean(digest.summaryRu) && (digest.citations?.length ?? 0) > 0;
return usable ? digest : null;
} catch {
// Network error, timeout, or malformed JSON — treat all as "not available".
return null;
}
}
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
/**
 * Normalize the `markdown` field of a Crawl4AI crawl result.
 *
 * Crawl4AI may return the markdown either as a plain string or as an object
 * carrying several variants. Preference order: raw_markdown, then
 * markdown_with_citations, then fit_markdown (only when non-empty).
 *
 * @param md - Raw `markdown` value from a Crawl4AI result (shape unknown).
 * @returns The best available markdown string, or '' when none is usable.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
if (typeof md === 'string') return md;
if (md && typeof md === 'object') {
const obj = md as Record<string, unknown>;
if (typeof obj.raw_markdown === 'string') return obj.raw_markdown;
if (typeof obj.markdown_with_citations === 'string') return obj.markdown_with_citations;
// `typeof` narrowing already proves this is a string; the original's
// redundant `as string` assertion is dropped.
if (typeof obj.fit_markdown === 'string' && obj.fit_markdown.length > 0) return obj.fit_markdown;
}
return '';
}
/**
 * Fetch readable article content for `url`, trying the Crawl4AI rendering
 * service first and falling back to a direct fetch with naive HTML stripping.
 * Returns null when neither path yields more than 100 characters of text.
 * Never throws.
 */
async function preScrapeArticleUrl(url: string): Promise<{ title: string; content: string; url: string } | null> {
// Primary path: Crawl4AI.
try {
const crawlRes = await fetch(`${CRAWL4AI_URL}/crawl`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
urls: [url],
crawler_config: { type: 'CrawlerRunConfig', params: { cache_mode: 'default', page_timeout: 20000 } },
}),
signal: AbortSignal.timeout(25000),
});
if (!crawlRes.ok) throw new Error(`Crawl4AI HTTP ${crawlRes.status}`);
const payload = (await crawlRes.json()) as {
results?: { markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
};
const result = payload.results?.[0];
const markdown = extractCrawl4aiMarkdown(result?.markdown);
if (result?.success && markdown.length > 100) {
return { title: result.metadata?.title ?? url, content: markdown.slice(0, 15000), url };
}
} catch {
// fallback to basic fetch
}
// Fallback path: plain fetch, then strip scripts/styles/tags to get text.
try {
const pageRes = await fetch(url, {
signal: AbortSignal.timeout(10000),
headers: { 'User-Agent': 'GooSeek-Agent/1.0' },
});
const html = await pageRes.text();
const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
const markup = bodyMatch ? bodyMatch[1] : html;
const text = markup
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.slice(0, 15000);
if (text.length > 100) return { title, content: text, url };
} catch {
// give up
}
return null;
}
/**
 * Extract the set of lowercase query words (length >= 2) used for relevance
 * scoring. A leading "Summary:" prefix and any URLs are stripped first so
 * they do not pollute the term set.
 */
function queryTerms(query: string): Set<string> {
const cleaned = query
.replace(/^Summary:\s*/i, '')
.replace(/https?:\/\/[^\s]+/g, '')
.toLowerCase();
return new Set(cleaned.split(/\s+/).filter((word) => word.length >= 2));
}
/**
 * Relevance score for a chunk against a set of query terms:
 * +3 for each term found in the title, +1 for each term found in the content
 * (both matched case-insensitively as substrings).
 */
function relevanceScore(chunk: Chunk, terms: Set<string>): number {
const titleLower = String(chunk.metadata?.title ?? '').toLowerCase();
const contentLower = (chunk.content ?? '').toLowerCase();
let total = 0;
for (const term of terms) {
total += titleLower.includes(term) ? 3 : 0;
total += contentLower.includes(term) ? 1 : 0;
}
return total;
}
/**
 * Return `chunks` ordered by descending relevance to `query`
 * (see relevanceScore). The input array is never mutated; when the query
 * yields no usable terms the original array is returned as-is.
 *
 * Scores are precomputed once per chunk (decorate-sort-undecorate) instead of
 * being recomputed inside the sort comparator, which in the original
 * re-scanned each chunk's full content O(log n) times per element.
 */
function rankByRelevance(chunks: Chunk[], query: string): Chunk[] {
if (chunks.length === 0) return [];
const terms = queryTerms(query);
if (terms.size === 0) return chunks;
return chunks
.map((chunk) => ({ chunk, score: relevanceScore(chunk, terms) }))
.sort((a, b) => b.score - a.score)
.map((entry) => entry.chunk);
}
export type SearchOrchestratorConfig = {
llm: LlmClient;
@@ -27,20 +151,287 @@ export type SearchOrchestratorInput = {
config: SearchOrchestratorConfig;
};
/**
* SPEED MODE: Optimized fast path
* - No LLM classifier (rule-based)
* - Direct SearXNG search (no researcher loop)
* - No page scraping (snippets only)
* - Parallel: search + media + widgets
* Target: 8-15 seconds (like Perplexity)
*/
async function runSpeedMode(
session: SessionManager,
input: {
chatHistory: { role: string; content: string }[];
followUp: string;
config: SearchOrchestratorConfig;
detectedLanguage?: string;
},
): Promise<void> {
const { chatHistory, followUp, config, detectedLanguage } = input;
// Rule-based classification (no LLM round-trip) and query expansion.
const classification = fastClassify(followUp, chatHistory);
const searchQuery = classification.standaloneFollowUp || followUp;
const queries = generateSearchQueries(searchQuery);
// Show the user a "searching" research step immediately.
const researchBlockId = crypto.randomUUID();
session.emitBlock({
id: researchBlockId,
type: 'research',
data: { subSteps: [{ id: crypto.randomUUID(), type: 'searching', searching: queries }] },
});
// Kick off all SearXNG queries in parallel; collect snippet-only results,
// deduplicated by URL. Individual query failures are ignored (best-effort).
const searchPromise = (async () => {
const results: Chunk[] = [];
const seenUrls = new Set<string>();
await Promise.all(
queries.map(async (q) => {
try {
const res = await searchSearxng(q, { categories: ['general', 'news'], pageno: 1 });
for (const r of res.results ?? []) {
if (r.url && !seenUrls.has(r.url)) {
seenUrls.add(r.url);
results.push({
// Fall back to the title when the engine returned no snippet.
content: r.content || r.title,
metadata: { title: r.title, url: r.url },
});
}
}
} catch { /* ignore search errors */ }
}),
);
return results;
})();
// Widgets run concurrently with the search; each widget output is emitted
// to the session as soon as the whole batch resolves.
const widgetPromise = executeAllWidgets({
chatHistory,
followUp,
classification: classification as ClassifierOutput,
llm: config.llm,
}).then((outputs) => {
for (const o of outputs) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: { widgetType: o.type, params: o.data ?? {} },
});
}
return outputs;
});
// Media search also runs in parallel; image/video galleries are emitted
// directly as widget blocks when non-empty.
const mediaPromise = searchMedia(searchQuery, { maxImages: 6, maxVideos: 4 }).then((mediaResult) => {
if (mediaResult.images.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'image_gallery',
params: { images: mediaResult.images, layout: 'carousel' },
},
});
}
if (mediaResult.videos.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'videos',
params: { items: mediaResult.videos, title: '' },
},
});
}
return mediaResult;
});
// Wait for all three pipelines; media result is only needed for its
// side-effect emissions above, so it is not destructured.
const [searchResults, widgetOutputs] = await Promise.all([
searchPromise,
widgetPromise,
mediaPromise,
]);
// Publish the full (pre-rerank) result list as the sources block.
session.emitBlock({
id: crypto.randomUUID(),
type: 'source',
data: searchResults,
});
session.emit('data', { type: 'researchComplete' });
// BM25 rerank with an adaptive top-K sized by query complexity.
const queryComplexity = estimateQueryComplexity(searchQuery);
const adaptiveTopK = computeAdaptiveTopK(searchResults.length, queryComplexity, 'speed');
const rerankableItems = searchResults.map((r) => ({
content: r.content,
title: (r.metadata?.title as string) ?? '',
url: (r.metadata?.url as string) ?? '',
metadata: r.metadata,
}));
const rankedItems = rerankBM25(rerankableItems, searchQuery, adaptiveTopK);
const findingsForWriter = rankedItems.slice(0, 15).map((item) => ({
content: item.content,
metadata: item.metadata ?? { title: item.title, url: item.url },
}));
// Build the writer context: truncated search snippets, numbered for citation.
const MAX_CONTENT_PER_RESULT = 250;
const finalContext = findingsForWriter
.map((f, index) => {
const content = f.content.length > MAX_CONTENT_PER_RESULT
? f.content.slice(0, MAX_CONTENT_PER_RESULT) + '…'
: f.content;
return `<result index=${index + 1} title="${String(f.metadata?.title ?? '').replace(/"/g, "'")}">${content}</result>`;
})
.join('\n') || '';
const widgetContext = widgetOutputs
.map((o) => `<result>${o.llmContext}</result>`)
.join('\n-------------\n');
// Search results are citable; widget output is context-only (already shown).
const finalContextWithWidgets =
`<search_results note="These are the search results and assistant can cite these">\n${finalContext}\n</search_results>\n` +
`<widgets_result noteForAssistant="Its output is already showed to the user, assistant can use this information to answer the query but do not CITE this as a source">\n${widgetContext}\n</widgets_result>`;
const writerPrompt = getWriterPrompt(
finalContextWithWidgets,
config.systemInstructions,
'speed',
config.locale,
config.memoryContext,
config.answerMode,
config.responsePrefs,
config.learningMode,
detectedLanguage,
false,
);
const answerStream = config.llm.streamText({
messages: [
{ role: 'system', content: writerPrompt },
...chatHistory,
{ role: 'user', content: followUp },
],
options: { maxTokens: 2048 },
});
// Stream the answer: the first non-empty chunk creates the text block,
// later chunks are emitted as incremental 'textChunk' events while the
// block's data is kept in sync for consumers that read the block directly.
let responseBlockId = '';
let hasContent = false;
let accumulatedText = '';
for await (const chunk of answerStream) {
const chunkText = chunk.contentChunk ?? '';
// Skip leading empty chunks until a block exists.
if (!chunkText && !responseBlockId) continue;
if (!responseBlockId) {
accumulatedText = chunkText;
const block: TextBlock = {
id: crypto.randomUUID(),
type: 'text',
data: chunkText,
};
session.emitBlock(block);
responseBlockId = block.id;
if (chunkText) hasContent = true;
} else if (chunkText) {
accumulatedText += chunkText;
hasContent = true;
session.emit('data', {
type: 'textChunk',
blockId: responseBlockId,
chunk: chunkText,
});
// Mutate the stored block so its data stays consistent with the stream.
const block = session.getBlock(responseBlockId) as TextBlock | null;
if (block) {
block.data = accumulatedText;
}
}
}
// Final authoritative update of the full accumulated text.
if (responseBlockId) {
session.updateBlock(responseBlockId, [{ op: 'replace', path: '/data', value: accumulatedText }]);
}
// Fallback: if the LLM produced nothing, render a plain list of top findings
// (user-facing strings are intentionally Russian).
if (!hasContent && findingsForWriter.length > 0) {
const lines = findingsForWriter.slice(0, 8).map((f, i) => {
const title = (f.metadata?.title as string) ?? 'Без названия';
const excerpt = f.content.length > 100 ? f.content.slice(0, 100) + '…' : f.content;
return `${i + 1}. **${title}** — ${excerpt}`;
});
session.emitBlock({
id: crypto.randomUUID(),
type: 'text',
data: `## По найденным источникам\n\n${lines.join('\n\n')}\n\n*Ответ LLM недоступен.*`,
});
}
session.emit('end', {});
}
export async function runSearchOrchestrator(
session: SessionManager,
input: SearchOrchestratorInput,
): Promise<void> {
const { chatHistory, followUp, config } = input;
const detectedLanguage = detectLanguage(followUp);
const isArticleSummary = followUp.trim().startsWith('Summary: ') && followUp.trim().length > 9;
const classification = await classify({
// SPEED MODE: Fast path — no LLM classifier, direct search, no scraping
if (config.mode === 'speed' && !isArticleSummary) {
await runSpeedMode(session, { chatHistory, followUp, config, detectedLanguage });
return;
}
let preScrapedArticle: { title: string; content: string; url: string } | null = null;
if (isArticleSummary) {
const articleUrl = followUp.trim().slice(9).trim();
const [digest, preScrapeResult] = await Promise.all([
fetchPreGeneratedDigest(articleUrl),
preScrapeArticleUrl(articleUrl),
]);
if (digest) {
const sourceBlock = {
id: crypto.randomUUID(),
type: 'source' as const,
data: digest.citations.map((c: { index: number; url: string; title: string; domain: string }) => ({
content: c.title,
metadata: { url: c.url, title: c.title, domain: c.domain, index: c.index },
})),
};
session.emitBlock(sourceBlock);
session.emit('data', { type: 'researchComplete' });
let summaryText = digest.summaryRu;
if (digest.followUp && digest.followUp.length > 0) {
summaryText += '\n\n---\n' + digest.followUp.map((q: string) => `> ${q}`).join('\n');
}
session.emitBlock({ id: crypto.randomUUID(), type: 'text', data: summaryText });
session.emit('end', {});
return;
}
preScrapedArticle = preScrapeResult;
}
let classification = await classify({
chatHistory,
query: followUp,
llm: config.llm,
locale: config.locale,
detectedLanguage,
enabledSources: config.sources,
});
if (isArticleSummary && classification.classification.skipSearch) {
classification = {
...classification,
classification: { ...classification.classification, skipSearch: false },
};
}
const widgetPromise = executeAllWidgets({
chatHistory,
followUp,
@@ -57,28 +448,87 @@ export async function runSearchOrchestrator(
return outputs;
});
const mediaQuery = classification.standaloneFollowUp || followUp;
const mediaPromise = !isArticleSummary
? searchMedia(mediaQuery, { maxImages: 8, maxVideos: 6 }).then((mediaResult) => {
if (mediaResult.images.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'image_gallery',
params: {
images: mediaResult.images,
layout: 'carousel',
},
},
});
}
if (mediaResult.videos.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'videos',
params: {
items: mediaResult.videos,
title: '',
},
},
});
}
return mediaResult;
})
: Promise.resolve({ images: [], videos: [] } as MediaSearchResult);
let searchPromise: Promise<{ searchFindings: import('../types.js').Chunk[] }> | null = null;
const effectiveFollowUp = (isArticleSummary && preScrapedArticle?.title)
? `Summary: ${preScrapedArticle.url}\nArticle title: ${preScrapedArticle.title}`
: followUp;
if (!classification.classification.skipSearch) {
searchPromise = research(session, config.llm, {
chatHistory,
followUp,
classification,
followUp: effectiveFollowUp,
classification: isArticleSummary && preScrapedArticle
? { ...classification, standaloneFollowUp: `${preScrapedArticle.title} ${classification.standaloneFollowUp}` }
: classification,
config: {
mode: config.mode,
sources: config.sources,
fileIds: config.fileIds,
locale: config.locale,
detectedLanguage,
isArticleSummary,
},
});
}
const [widgetOutputs, searchResults] = await Promise.all([widgetPromise, searchPromise ?? Promise.resolve({ searchFindings: [] })]);
const [widgetOutputs, searchResults] = await Promise.all([
widgetPromise,
searchPromise ?? Promise.resolve({ searchFindings: [] }),
mediaPromise,
]);
session.emit('data', { type: 'researchComplete' });
const MAX_RESULTS_FOR_WRITER = 15;
const MAX_CONTENT_PER_RESULT = 180;
const findingsForWriter = (searchResults?.searchFindings ?? []).slice(0, MAX_RESULTS_FOR_WRITER);
const MAX_RESULTS_FOR_WRITER = isArticleSummary ? 30 : 25;
const MAX_CONTENT_PER_RESULT = isArticleSummary ? 2000 : 320;
const rawFindings = searchResults?.searchFindings ?? [];
if (isArticleSummary && preScrapedArticle) {
const alreadyHasUrl = rawFindings.some(
(f) => (f.metadata?.url as string)?.includes(preScrapedArticle!.url),
);
if (!alreadyHasUrl) {
rawFindings.unshift({
content: preScrapedArticle.content,
metadata: { url: preScrapedArticle.url, title: preScrapedArticle.title },
});
}
}
const findingsForWriter = rankByRelevance(rawFindings, followUp).slice(0, MAX_RESULTS_FOR_WRITER);
const finalContext =
findingsForWriter
.map((f, index) => {
@@ -104,6 +554,8 @@ export async function runSearchOrchestrator(
config.answerMode,
config.responsePrefs,
config.learningMode,
detectedLanguage,
isArticleSummary,
);
const answerStream = config.llm.streamText({
@@ -117,27 +569,43 @@ export async function runSearchOrchestrator(
let responseBlockId = '';
let hasContent = false;
let accumulatedText = '';
for await (const chunk of answerStream) {
if (!chunk.contentChunk && !responseBlockId) continue;
const chunkText = chunk.contentChunk ?? '';
if (!chunkText && !responseBlockId) continue;
if (!responseBlockId) {
accumulatedText = chunkText;
const block: TextBlock = {
id: crypto.randomUUID(),
type: 'text',
data: chunk.contentChunk ?? '',
data: chunkText,
};
session.emitBlock(block);
responseBlockId = block.id;
if (chunk.contentChunk) hasContent = true;
} else {
if (chunkText) hasContent = true;
} else if (chunkText) {
accumulatedText += chunkText;
hasContent = true;
// Отправляем только новый чанк для немедленного отображения
session.emit('data', {
type: 'textChunk',
blockId: responseBlockId,
chunk: chunkText
});
// Также обновляем полный блок для консистентности
const block = session.getBlock(responseBlockId) as TextBlock | null;
if (block) {
block.data += chunk.contentChunk ?? '';
if (chunk.contentChunk) hasContent = true;
session.updateBlock(block.id, [{ op: 'replace', path: '/data', value: block.data }]);
block.data = accumulatedText;
}
}
}
// Финальное обновление блока
if (responseBlockId) {
session.updateBlock(responseBlockId, [{ op: 'replace', path: '/data', value: accumulatedText }]);
}
if (!hasContent && findingsForWriter.length > 0) {
const lines = findingsForWriter.slice(0, 10).map((f, i) => {