feat: Go backend, enhanced search, new widgets, Docker deploy

Major changes:
- Add Go backend (backend/) with microservices architecture
- Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler,
  proxy-manager, media-search, fastClassifier, language detection
- New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard,
  UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions
- Improved discover-svc with discover-db integration
- Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md)
- Library-svc: project_id schema migration
- Remove deprecated finance-svc and travel-svc
- Localization improvements across services

Made-with: Cursor
This commit is contained in:
home
2026-02-27 04:15:32 +03:00
parent 328d968f3f
commit 06fe57c765
285 changed files with 53132 additions and 1871 deletions

View File

@@ -1,7 +1,9 @@
# syntax=docker/dockerfile:1
FROM node:22-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm install
COPY --from=npm-cache / /tmp/npm-cache
RUN npm install --cache /tmp/npm-cache --prefer-offline --no-audit
COPY tsconfig.json ./
COPY src ./src
RUN npm run build
@@ -9,7 +11,8 @@ RUN npm run build
FROM node:22-alpine
WORKDIR /app
COPY package*.json ./
RUN npm install --omit=dev
COPY --from=npm-cache / /tmp/npm-cache
RUN npm install --omit=dev --cache /tmp/npm-cache --prefer-offline --no-audit
COPY --from=builder /app/dist ./dist
EXPOSE 3018
CMD ["node", "dist/index.js"]

View File

@@ -13,11 +13,14 @@
"@fastify/cors": "^9.0.1",
"@toolsycc/json-repair": "^0.1.22",
"fastify": "^4.28.1",
"franc": "^6.2.0",
"https-proxy-agent": "^7.0.6",
"mathjs": "^15.1.0",
"ollama": "^0.6.3",
"openai": "^6.9.0",
"partial-json": "^0.1.7",
"rfc6902": "^5.1.2",
"socks-proxy-agent": "^8.0.5",
"turndown": "^7.2.2",
"yahoo-finance2": "^3.13.0",
"zod": "^4.1.12"

View File

@@ -141,6 +141,7 @@ app.post<{ Body: unknown }>('/api/v1/agents/search', async (req, reply) => {
if (event === 'data') {
if (d.type === 'block') controller.enqueue(encoder.encode(JSON.stringify({ type: 'block', block: d.block }) + '\n'));
else if (d.type === 'updateBlock') controller.enqueue(encoder.encode(JSON.stringify({ type: 'updateBlock', blockId: d.blockId, patch: d.patch }) + '\n'));
else if (d.type === 'textChunk') controller.enqueue(encoder.encode(JSON.stringify({ type: 'textChunk', blockId: (d as { blockId?: string }).blockId, chunk: (d as { chunk?: string }).chunk }) + '\n'));
else if (d.type === 'researchComplete') controller.enqueue(encoder.encode(JSON.stringify({ type: 'researchComplete' }) + '\n'));
} else if (event === 'end') {
controller.enqueue(encoder.encode(JSON.stringify({ type: 'messageEnd' }) + '\n'));

View File

@@ -17,7 +17,7 @@ const planAction: ResearchAction<typeof schema> = {
schema,
getToolDescription: () =>
'Use this FIRST on every turn to state your plan in natural language before any other action. Keep it short, action-focused, and tailored to the current query.',
getDescription: () => actionDescription,
getDescription: (_config) => actionDescription,
enabled: (config) => config.mode !== 'speed',
execute: async (input) => ({
type: 'reasoning',

View File

@@ -4,22 +4,22 @@ import type { Chunk, SearchResultsResearchBlock } from '../types.js';
import { searchSearxng } from '../searxng.js';
const schema = z.object({
queries: z.array(z.string()).describe('List of academic search queries'),
queries: z.array(z.string()).max(6).describe('List of academic search queries'),
});
const academicSearchAction: ResearchAction<typeof schema> = {
name: 'academic_search',
schema,
getToolDescription: () =>
'Use this tool to perform academic searches for scholarly articles, papers, and research studies. Provide up to 3 queries at a time.',
'Use this tool to perform academic searches for scholarly articles. Provide up to 6 queries in the user\'s language.',
getDescription: () =>
'Use this tool to perform academic searches for scholarly articles and research studies. Provide concise search queries. You can provide up to 3 queries at a time.',
'Use this tool to perform academic searches for scholarly articles and research studies. Up to 6 queries in the user\'s language.',
enabled: (config) =>
config.sources.includes('academic') &&
config.classification.classification.skipSearch === false &&
config.classification.classification.academicSearch === true,
execute: async (input, additionalConfig) => {
input.queries = input.queries.slice(0, 3);
input.queries = input.queries.slice(0, 6);
const researchBlock = additionalConfig.session.getBlock(additionalConfig.researchBlockId);
@@ -39,7 +39,23 @@ const academicSearchAction: ResearchAction<typeof schema> = {
const results: Chunk[] = [];
const search = async (q: string) => {
const res = await searchSearxng(q, { engines: ['arxiv', 'google scholar', 'pubmed'] });
const [page1, page2] = await Promise.all([
searchSearxng(q, {
engines: ['arxiv', 'google scholar', 'pubmed', 'semantic scholar', 'openalex'],
pageno: 1,
}),
searchSearxng(q, {
engines: ['arxiv', 'google scholar', 'pubmed', 'semantic scholar', 'openalex'],
pageno: 2,
}),
]);
const seenUrls = new Set<string>();
const allResults = [...(page1.results ?? []), ...(page2.results ?? [])].filter((r) => {
if (!r.url || seenUrls.has(r.url)) return false;
seenUrls.add(r.url);
return true;
});
const res = { results: allResults };
const resultChunks: Chunk[] = res.results.map((r) => ({
content: r.content || r.title,
metadata: { title: r.title, url: r.url },

View File

@@ -7,7 +7,7 @@ const doneAction: ResearchAction<typeof emptySchema> = {
schema: emptySchema,
getToolDescription: () =>
'Only call this after __reasoning_preamble AND after any other needed tool calls when you truly have enough to answer. Do not call if information is still missing.',
getDescription: () =>
getDescription: (_config) =>
'Use this action ONLY when you have completed all necessary research and are ready to provide a final answer. YOU MUST CALL THIS ACTION TO SIGNAL COMPLETION; DO NOT OUTPUT FINAL ANSWERS DIRECTLY TO THE USER.',
enabled: () => true,
execute: async () => ({ type: 'done' }),

View File

@@ -30,14 +30,14 @@ export function getAvailableActions(config: ActionConfig): ResearchAction[] {
export function getAvailableActionTools(config: ActionConfig): { name: string; description: string; schema: unknown }[] {
return getAvailableActions(config).map((a) => ({
name: a.name,
description: a.getToolDescription({ mode: config.mode }),
description: a.getToolDescription(config),
schema: a.schema,
}));
}
export function getAvailableActionsDescriptions(config: ActionConfig): string {
return getAvailableActions(config)
.map((a) => `<tool name="${a.name}">\n${a.getDescription({ mode: config.mode })}\n</tool>`)
.map((a) => `<tool name="${a.name}">\n${a.getDescription(config)}\n</tool>`)
.join('\n\n');
}

View File

@@ -1,24 +1,114 @@
import z from 'zod';
import TurndownService from 'turndown';
import type { ResearchAction } from './types.js';
import type { Chunk, ReadingResearchBlock } from '../types.js';
const turndownService = new TurndownService();
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
const CRAWL4AI_TIMEOUT_MS = 25_000;
/** Value type for a session budget: live scrape count plus its expiry time. */
type ScrapeCounter = { count: number; expiresAt: number };

/** Per-session scrape tallies, keyed by research-block/session id. */
const sessionScrapeCounters = new Map<string, ScrapeCounter>();
/** Budget entries older than this are considered stale and reset. */
const SESSION_CLEANUP_MS = 5 * 60 * 1000;

/** Per-session scrape budget for each search mode; speed mode disables scraping. */
function getMaxScrapesForMode(mode?: 'speed' | 'balanced' | 'quality'): number {
  switch (mode) {
    case 'speed': return 0;
    case 'balanced': return 3;
    case 'quality': return 6;
    default: return 3;
  }
}

/** Number of URLs already scraped in this session; 0 when unknown or expired. */
function getScrapeCount(sessionId: string): number {
  const entry = sessionScrapeCounters.get(sessionId);
  if (!entry) return 0;
  if (Date.now() >= entry.expiresAt) {
    // Lazy expiry: drop the stale entry on first read after the TTL.
    sessionScrapeCounters.delete(sessionId);
    return 0;
  }
  return entry.count;
}

/**
 * Adds `count` scrapes to the session's tally and refreshes its TTL.
 *
 * Fix: the original scheduled an unconditional `setTimeout` delete on EVERY
 * increment, so the first timer fired 5 minutes after the first scrape and
 * wiped the whole counter — silently resetting the budget mid-session — and
 * the pending timers kept the event loop alive. Timestamp-based lazy expiry
 * avoids both problems.
 */
function incrementScrapeCount(sessionId: string, count: number): void {
  const now = Date.now();
  // Opportunistically sweep stale sessions so the map cannot grow unbounded.
  for (const [key, entry] of sessionScrapeCounters) {
    if (now >= entry.expiresAt) sessionScrapeCounters.delete(key);
  }
  const prev = sessionScrapeCounters.get(sessionId);
  const live = prev && now < prev.expiresAt ? prev.count : 0;
  sessionScrapeCounters.set(sessionId, { count: live + count, expiresAt: now + SESSION_CLEANUP_MS });
}
/**
 * Normalizes the `markdown` field of a Crawl4AI result, which may be a plain
 * string or an object carrying several markdown variants.
 * Preference order: raw_markdown → markdown_with_citations → non-empty
 * fit_markdown. Returns '' when nothing usable is present.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (md === null || typeof md !== 'object') return '';
  const variants = md as Record<string, unknown>;
  const { raw_markdown, markdown_with_citations, fit_markdown } = variants;
  if (typeof raw_markdown === 'string') return raw_markdown;
  if (typeof markdown_with_citations === 'string') return markdown_with_citations;
  // fit_markdown can legitimately be empty — only use it when non-empty.
  if (typeof fit_markdown === 'string' && fit_markdown.length > 0) return fit_markdown;
  return '';
}
const schema = z.object({
urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
});
/**
 * Fetches one URL through the Crawl4AI service and returns its title plus
 * markdown content (capped at 12 000 chars).
 * Throws when the HTTP call fails or the crawler returns no usable markdown.
 */
async function scrapeViaCrawl4ai(url: string, useCache = true): Promise<{ title: string; markdown: string }> {
  const payload = {
    urls: [url],
    crawler_config: {
      type: 'CrawlerRunConfig',
      params: { cache_mode: useCache ? 'default' : 'bypass', page_timeout: 15000 },
    },
  };
  const response = await fetch(`${CRAWL4AI_URL}/crawl`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(CRAWL4AI_TIMEOUT_MS),
  });
  if (!response.ok) throw new Error(`Crawl4AI HTTP ${response.status}`);

  type CrawlResult = { url: string; markdown?: unknown; metadata?: { title?: string }; success?: boolean };
  const body = (await response.json()) as { results?: CrawlResult[] };
  const first = body.results?.[0];
  const markdown = extractCrawl4aiMarkdown(first?.markdown);
  if (!first?.success || !markdown) throw new Error('Crawl4AI returned empty result');

  return {
    title: first.metadata?.title ?? `Content from ${url}`,
    markdown: markdown.slice(0, 12000),
  };
}
/**
 * Last-resort scraper: plain fetch plus crude HTML stripping.
 * Extracts the <title>, removes scripts/styles/tags from the <body>,
 * and returns at most 12 000 chars of whitespace-collapsed text.
 */
async function scrapeFallback(url: string): Promise<{ title: string; markdown: string }> {
  const response = await fetch(url, { signal: AbortSignal.timeout(10000) });
  const html = await response.text();
  const title = html.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`;
  // Prefer just the <body>; fall back to the whole document when absent.
  const bodyHtml = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[1] ?? html;
  const stripped = bodyHtml
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return { title, markdown: stripped.slice(0, 12000) };
}
const scrapeURLAction: ResearchAction<typeof schema> = {
name: 'scrape_url',
schema,
getToolDescription: () =>
'Use this tool to scrape and extract content from the provided URLs. You can provide up to 3 URLs at a time. NEVER CALL THIS TOOL EXPLICITLY YOURSELF UNLESS INSTRUCTED TO DO SO BY THE USER.',
getDescription: () =>
getDescription: (_config) =>
'Use this tool to scrape content from specific web pages. Only call when the user has specifically requested information from certain URLs. Never call yourself to get extra information without user instruction.',
enabled: () => true,
enabled: (config) => config.mode !== 'speed',
execute: async (params, additionalConfig) => {
params.urls = params.urls.slice(0, 3);
const sessionId = additionalConfig.researchBlockId;
const maxScrapes = getMaxScrapesForMode(additionalConfig.mode);
const currentCount = getScrapeCount(sessionId);
const remainingBudget = Math.max(0, maxScrapes - currentCount);
if (remainingBudget === 0) {
return {
type: 'search_results',
results: [{
content: `Scrape budget exhausted for this session (${maxScrapes} URLs already scraped). Use existing search results.`,
metadata: { url: '', title: 'Scrape limit reached' },
}],
};
}
const urlsToScrape = params.urls.slice(0, Math.min(3, remainingBudget));
incrementScrapeCount(sessionId, urlsToScrape.length);
const researchBlock = additionalConfig.session.getBlock(additionalConfig.researchBlockId);
let readingBlockId = crypto.randomUUID();
@@ -26,18 +116,21 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
const results: Chunk[] = [];
await Promise.all(
params.urls.map(async (url) => {
urlsToScrape.map(async (url: string) => {
try {
const res = await fetch(url, { signal: AbortSignal.timeout(10000) });
const text = await res.text();
const title = text.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`;
let scraped: { title: string; markdown: string };
try {
scraped = await scrapeViaCrawl4ai(url);
} catch {
scraped = await scrapeFallback(url);
}
if (!readingEmitted && researchBlock && researchBlock.type === 'research') {
readingEmitted = true;
researchBlock.data.subSteps.push({
id: readingBlockId,
type: 'reading',
reading: [{ content: '', metadata: { url, title } }],
reading: [{ content: '', metadata: { url, title: scraped.title } }],
});
additionalConfig.session.updateBlock(additionalConfig.researchBlockId, [
{ op: 'replace', path: '/data/subSteps', value: researchBlock.data.subSteps },
@@ -46,15 +139,14 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
const subStepIndex = researchBlock.data.subSteps.findIndex((s) => s.id === readingBlockId);
const subStep = researchBlock.data.subSteps[subStepIndex] as ReadingResearchBlock | undefined;
if (subStep) {
subStep.reading.push({ content: '', metadata: { url, title } });
subStep.reading.push({ content: '', metadata: { url, title: scraped.title } });
additionalConfig.session.updateBlock(additionalConfig.researchBlockId, [
{ op: 'replace', path: '/data/subSteps', value: researchBlock.data.subSteps },
]);
}
}
const markdown = turndownService.turndown(text);
results.push({ content: markdown, metadata: { url, title } });
results.push({ content: scraped.markdown, metadata: { url, title: scraped.title } });
} catch (error) {
results.push({
content: `Failed to fetch content from ${url}: ${error}`,

View File

@@ -4,22 +4,22 @@ import type { Chunk, SearchResultsResearchBlock } from '../types.js';
import { searchSearxng } from '../searxng.js';
const schema = z.object({
queries: z.array(z.string()).describe('List of social search queries'),
queries: z.array(z.string()).max(6).describe('List of social search queries'),
});
const socialSearchAction: ResearchAction<typeof schema> = {
name: 'social_search',
schema,
getToolDescription: () =>
'Use this tool to perform social media searches for relevant posts, discussions, and trends. Provide up to 3 queries at a time.',
'Use this tool to perform social media searches for posts, discussions, and trends. Provide up to 6 queries in the user\'s language.',
getDescription: () =>
'Use this tool to perform social media searches for posts, discussions, and trends. Provide concise search queries. You can provide up to 3 queries at a time.',
'Use this tool to perform social media searches for posts, discussions, and trends. Up to 6 queries in the user\'s language.',
enabled: (config) =>
config.sources.includes('discussions') &&
config.classification.classification.skipSearch === false &&
config.classification.classification.discussionSearch === true,
execute: async (input, additionalConfig) => {
input.queries = input.queries.slice(0, 3);
input.queries = input.queries.slice(0, 6);
const researchBlock = additionalConfig.session.getBlock(additionalConfig.researchBlockId);
@@ -39,7 +39,23 @@ const socialSearchAction: ResearchAction<typeof schema> = {
const results: Chunk[] = [];
const search = async (q: string) => {
const res = await searchSearxng(q, { engines: ['reddit'] });
const [page1, page2] = await Promise.all([
searchSearxng(q, {
categories: ['social_media'],
pageno: 1,
}),
searchSearxng(q, {
categories: ['social_media'],
pageno: 2,
}),
]);
const seenUrls = new Set<string>();
const allResults = [...(page1.results ?? []), ...(page2.results ?? [])].filter((r) => {
if (!r.url || seenUrls.has(r.url)) return false;
seenUrls.add(r.url);
return true;
});
const res = { results: allResults };
const resultChunks: Chunk[] = res.results.map((r) => ({
content: r.content || r.title,
metadata: { title: r.title, url: r.url },

View File

@@ -24,12 +24,18 @@ export type ActionConfig = {
mode: SearchMode;
sources: SearchSources[];
hasEmbedding?: boolean;
detectedLanguage?: string;
/** Запрос саммари статьи (Summary: url) — веб-поиск только по категориям news/science, макс 5 источников */
isArticleSummary?: boolean;
};
export type AdditionalConfig = {
session: SessionManager;
researchBlockId: string;
fileIds: string[];
isArticleSummary?: boolean;
mode?: SearchMode;
originalQuery?: string;
};
export type SearchActionOutput = { type: 'search_results'; results: Chunk[] };
@@ -42,8 +48,8 @@ export type ToolCall = { id: string; name: string; arguments: Record<string, unk
export interface ResearchAction<TSchema extends z.ZodObject<Record<string, z.ZodTypeAny>> = z.ZodObject<Record<string, z.ZodTypeAny>>> {
name: string;
schema: TSchema;
getToolDescription: (config: { mode: SearchMode }) => string;
getDescription: (config: { mode: SearchMode }) => string;
getToolDescription: (config: ActionConfig) => string;
getDescription: (config: ActionConfig) => string;
enabled: (config: ActionConfig) => boolean;
execute: (
params: z.infer<TSchema>,

View File

@@ -2,22 +2,75 @@ import z from 'zod';
import type { ResearchAction } from './types.js';
import type { Chunk, SearchResultsResearchBlock } from '../types.js';
import { searchSearxng } from '../searxng.js';
import { rerankBM25, computeAdaptiveTopK, estimateQueryComplexity } from '../reranker.js';
const MAX_CONCURRENT_SEARCHES = 5;
const SEARCH_DELAY_MS = 50;
/**
 * Counting semaphore limiting the number of concurrently running async
 * operations. Waiters are resumed in FIFO order; a released permit is
 * handed directly to the oldest waiter when one exists.
 */
class SearchSemaphore {
  private available: number;
  private readonly waiters: (() => void)[] = [];

  constructor(permits: number) {
    this.available = permits;
  }

  /** Waits until a permit is free, then takes it. */
  async acquire(): Promise<void> {
    if (this.available > 0) {
      this.available -= 1;
      return;
    }
    await new Promise<void>((resolve) => {
      this.waiters.push(resolve);
    });
  }

  /** Returns a permit, waking the oldest waiter if any is queued. */
  release(): void {
    const wake = this.waiters.shift();
    if (wake) wake();
    else this.available += 1;
  }

  /** Runs `fn` while holding a permit; always releases, even on throw. */
  async runExclusive<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }
}
const searchSemaphore = new SearchSemaphore(MAX_CONCURRENT_SEARCHES);
/**
 * Runs a SearXNG query through the shared semaphore so that at most
 * MAX_CONCURRENT_SEARCHES requests are in flight, inserting a short delay
 * before each request to avoid hammering the search backend.
 */
async function rateLimitedSearch(
  query: string,
  opts: { categories?: string[]; pageno?: number },
): Promise<{ results: { content?: string; title: string; url: string }[] }> {
  const delayedQuery = async () => {
    await new Promise<void>((resolve) => setTimeout(resolve, SEARCH_DELAY_MS));
    return searchSearxng(query, opts);
  };
  return searchSemaphore.runExclusive(delayedQuery);
}
const schema = z.object({
queries: z.array(z.string()).describe('An array of search queries to perform web searches for.'),
queries: z.array(z.string()).max(6).describe('An array of search queries to perform web searches for.'),
});
const webSearchAction: ResearchAction<typeof schema> = {
name: 'web_search',
schema,
getToolDescription: () =>
'Use this tool to perform web searches based on the provided queries. You can provide up to 3 queries at a time.',
'Use this tool to perform web searches. Provide up to 6 queries in the user\'s language.',
getDescription: () =>
'Use this tool to perform web searches. Your queries should be targeted and specific, SEO-friendly keywords. You can search for 3 queries in one go.',
'Use this tool to perform web searches. Queries should be targeted, SEO-friendly. Up to 6 queries in the user\'s language.',
enabled: (config) =>
config.sources.includes('web') && config.classification.classification.skipSearch === false,
execute: async (input, additionalConfig) => {
input.queries = input.queries.slice(0, 3);
input.queries = input.queries.slice(0, 6);
const researchBlock = additionalConfig.session.getBlock(additionalConfig.researchBlockId);
@@ -35,15 +88,45 @@ const webSearchAction: ResearchAction<typeof schema> = {
const searchResultsBlockId = crypto.randomUUID();
let searchResultsEmitted = false;
const results: Chunk[] = [];
// Саммари по ссылке: только новости + наука. Обычный чат: текстовые категории (без images/videos/files/music/map/social_media).
const isArticleSummary = !!additionalConfig.isArticleSummary;
const TEXT_CATEGORIES = ['general', 'science', 'it', 'news'] as const;
const opts = isArticleSummary
? { categories: ['news', 'science'] as string[], pageno: 1 as number }
: { categories: [...TEXT_CATEGORIES] };
const search = async (q: string) => {
let res: { results: { content?: string; title: string; url: string }[] };
let allRawResults: { content?: string; title: string; url: string }[] = [];
try {
res = await searchSearxng(q);
if (isArticleSummary) {
const pages = await Promise.allSettled([
rateLimitedSearch(q, { ...opts, pageno: 1 }),
rateLimitedSearch(q, { ...opts, pageno: 2 }),
]);
const seenUrls = new Set<string>();
for (const p of pages) {
if (p.status !== 'fulfilled') continue;
for (const r of p.value.results ?? []) {
if (r.url && !seenUrls.has(r.url)) {
seenUrls.add(r.url);
allRawResults.push(r);
}
}
}
} else {
const res = await rateLimitedSearch(q, { ...opts, pageno: 1 });
const seenUrls = new Set<string>();
for (const r of res.results ?? []) {
if (r.url && !seenUrls.has(r.url)) {
seenUrls.add(r.url);
allRawResults.push(r);
}
}
}
} catch {
return;
}
const resultChunks: Chunk[] = res.results.map((r) => ({
const resultChunks: Chunk[] = allRawResults.map((r) => ({
content: r.content || r.title,
metadata: { title: r.title, url: r.url },
}));
@@ -73,7 +156,30 @@ const webSearchAction: ResearchAction<typeof schema> = {
await Promise.all(input.queries.map(search));
return { type: 'search_results', results };
if (results.length === 0) {
return { type: 'search_results', results };
}
const originalQuery = additionalConfig.originalQuery ?? input.queries.join(' ');
const mode = additionalConfig.mode ?? 'balanced';
const queryComplexity = estimateQueryComplexity(originalQuery);
const adaptiveTopK = computeAdaptiveTopK(results.length, queryComplexity, mode);
const rerankableItems = results.map((r) => ({
content: r.content,
title: (r.metadata?.title as string) ?? '',
url: (r.metadata?.url as string) ?? '',
metadata: r.metadata,
}));
const rankedItems = rerankBM25(rerankableItems, originalQuery, adaptiveTopK);
const rankedResults: Chunk[] = rankedItems.map((item) => ({
content: item.content,
metadata: item.metadata ?? { title: item.title, url: item.url },
}));
return { type: 'search_results', results: rankedResults };
},
};

View File

@@ -21,13 +21,14 @@ export type ClassifierInput = {
query: string;
llm: LlmClient;
locale?: string;
detectedLanguage?: string;
enabledSources: ('web' | 'discussions' | 'academic')[];
};
export async function classify(input: ClassifierInput): Promise<z.infer<typeof schema>> {
const output = await input.llm.generateObject<z.infer<typeof schema>>({
messages: [
{ role: 'system', content: getClassifierPrompt(input.locale) },
{ role: 'system', content: getClassifierPrompt(input.locale, input.detectedLanguage) },
{
role: 'user',
content: `<conversation_history>\n${formatChatHistoryAsString(input.chatHistory)}\n</conversation_history>\n<user_query>\n${input.query}\n</user_query>`,

View File

@@ -0,0 +1,109 @@
/**
* Fast rule-based classifier for speed mode
* No LLM calls — instant classification based on patterns
*/
/** Output of the rule-based fast classifier: routing flags plus a standalone query. */
export interface FastClassifierOutput {
  classification: {
    /** Answer directly without any web search (greetings, pure widget queries). */
    skipSearch: boolean;
    /** Always false on this fast path — never triggers personal search. */
    personalSearch: boolean;
    /** Query matched academic patterns (papers, research, scholar). */
    academicSearch: boolean;
    /** Query matched discussion patterns (reviews, forums, opinions). */
    discussionSearch: boolean;
    /** Show the weather widget. */
    showWeatherWidget: boolean;
    /** Show the stock-quote widget. */
    showStockWidget: boolean;
    /** Show the calculator widget. */
    showCalculationWidget: boolean;
  };
  /** Query rewritten to stand alone (leading pronoun resolved from history). */
  standaloneFollowUp: string;
}
// Regex heuristics for the fast classifier (Russian + English).
const WEATHER_PATTERNS = /погод[аеуы]|temperature|weather|forecast|прогноз|климат|осадки|дожд[ьи]|снег|температур/i;
const STOCK_PATTERNS = /акци[яиейю]|stock|share|котировк|биржа|nasdaq|nyse|s&p|доллар курс|евро курс|рубл/i;
const CALC_PATTERNS = /\d+\s*[\+\-\*\/\^]\s*\d+|калькул|calculate|вычисл|посчита|convert|конверт|сколько.*в.*\d/i;
const ACADEMIC_PATTERNS = /научн|research|paper|статья.*журнал|диссертац|pubmed|scholar|исследован.*показ/i;
const DISCUSSION_PATTERNS = /отзыв|review|мнени|opinion|форум|reddit|обсужд|опыт.*использ|кто.*пробовал/i;
// Fixed: the trailing `\b` made every Cyrillic alternative unmatchable — JS `\b`
// is ASCII-based and never matches adjacent to Cyrillic letters, so «привет»,
// «здравствуй», «добр…» were never detected as greetings. An ASCII letter/digit
// lookahead keeps English whole-word behavior ('hi' but not 'hit') while
// allowing Russian prefixes («добрый день», «приветствую»).
const GREETING_PATTERNS = /^(привет|здравствуй|добр|hi|hello|hey|good morning|good evening)(?![a-z0-9_])/i;
// NOTE(review): currently unused in this module — keep or wire into fastClassify.
const SIMPLE_FACT_PATTERNS = /^(что такое|who is|what is|кто такой|какой год|when was|когда был)\s+[a-zA-Zа-яА-Я\s]{2,20}$/i;
/**
 * Resolves a leading pronoun in a follow-up query against the previous user
 * message, producing a standalone query. Falls back to the trimmed query when
 * no pronoun or context is found.
 *
 * Fix: JS `\b` is ASCII-only and never matches adjacent to Cyrillic letters,
 * so the original `\b` anchor silently disabled every Russian pronoun («это»,
 * «он», …). A letter/digit lookahead restores whole-word matching for both
 * alphabets.
 */
function extractStandaloneQuery(query: string, history: { role: string; content: string }[]): string {
  const q = query.trim();
  const pronouns = /^(это|он|она|оно|они|его|её|их|этот|эта|эти|it|this|that|they|them|he|she)(?![a-zа-яё0-9_])/i;
  if (pronouns.test(q) && history.length > 0) {
    const lastUserMsg = [...history].reverse().find((m) => m.role === 'user');
    if (lastUserMsg) {
      // Pull the topic after "о/про/about/regarding" from the last user turn.
      const context = lastUserMsg.content.match(/(?:о|про|about|regarding)\s+([^,.!?]+)/i)?.[1];
      if (context) {
        return q.replace(pronouns, context);
      }
    }
  }
  return q;
}
/**
 * Rule-based classifier used in speed mode: derives widget and search flags
 * from regex patterns, with no LLM call.
 */
export function fastClassify(
  query: string,
  history: { role: string; content: string }[] = [],
): FastClassifierOutput {
  const normalized = query.toLowerCase().trim();
  const standaloneFollowUp = extractStandaloneQuery(query, history);

  const baseFlags = {
    skipSearch: false,
    personalSearch: false,
    academicSearch: false,
    discussionSearch: false,
    showWeatherWidget: false,
    showStockWidget: false,
    showCalculationWidget: false,
  };

  // Short greetings need no search and no widgets at all.
  if (GREETING_PATTERNS.test(normalized) && normalized.length < 30) {
    return { classification: { ...baseFlags, skipSearch: true }, standaloneFollowUp };
  }

  const showWeatherWidget = WEATHER_PATTERNS.test(normalized);
  const showStockWidget = STOCK_PATTERNS.test(normalized);
  const showCalculationWidget = CALC_PATTERNS.test(normalized);
  const widgetOnly = showWeatherWidget || showStockWidget || showCalculationWidget;
  // Widget-style queries skip web search unless they also mention news or are long.
  const skipSearch =
    widgetOnly && !normalized.includes('новост') && !normalized.includes('news') && normalized.length < 50;

  return {
    classification: {
      ...baseFlags,
      skipSearch,
      academicSearch: ACADEMIC_PATTERNS.test(normalized),
      discussionSearch: DISCUSSION_PATTERNS.test(normalized),
      showWeatherWidget,
      showStockWidget,
      showCalculationWidget,
    },
    standaloneFollowUp,
  };
}
/**
 * Expands a user query into up to four search variants: the raw query, a
 * shortened half-length variant for long queries, and a reordered variant
 * with the "в <место>" location phrase moved to the end.
 *
 * Fix: the original location pattern ended in `\b`, which in JS never matches
 * after a Cyrillic letter (`\b` is ASCII-based), so the location branch was
 * dead code; an explicit lookahead restores it. The rewritten variant also
 * collapses the double space left behind by removing the location phrase.
 */
export function generateSearchQueries(query: string): string[] {
  const q = query.trim();
  const queries: string[] = [q];
  // Long, unquoted queries also get a shortened first-half variant.
  if (q.length > 15 && !q.includes('"')) {
    const words = q.split(/\s+/).filter((w) => w.length > 2);
    if (words.length >= 4) {
      queries.push(words.slice(0, Math.ceil(words.length / 2)).join(' '));
    }
  }
  const locationMatch = q.match(/в\s+([а-яё]+е|[а-яё]+и)(?![а-яё])/i);
  if (locationMatch) {
    const withoutLocation = q.replace(locationMatch[0], '').replace(/\s+/g, ' ').trim();
    if (withoutLocation.length > 5) {
      queries.push(`${withoutLocation} ${locationMatch[1]}`);
    }
  }
  return queries.slice(0, 4);
}

View File

@@ -6,12 +6,16 @@ import type { ClassifierOutput } from '../actions/types.js';
import { getResearcherPrompt } from '../prompts/researcher.js';
import { getAvailableActionTools, getAvailableActionsDescriptions, executeAll } from '../actions/registry.js';
import formatChatHistoryAsString from '../utils/formatHistory.js';
import { classifySearchResults, shouldGenerateKnowledgeCard } from '../content-classifier.js';
import type { ProductData, VideoData, ProfileData, PromoData, ClassifiedResult } from '../types/widgets.js';
export type ResearcherConfig = {
mode: 'speed' | 'balanced' | 'quality';
sources: ('web' | 'discussions' | 'academic')[];
fileIds: string[];
locale?: string;
detectedLanguage?: string;
isArticleSummary?: boolean;
};
export type ResearcherInput = {
@@ -32,7 +36,7 @@ export async function research(
llm: LlmClient,
input: ResearcherInput,
): Promise<ResearcherOutput> {
const maxIteration = input.config.mode === 'speed' ? 2 : input.config.mode === 'balanced' ? 6 : 25;
const maxIteration = input.config.mode === 'speed' ? 1 : input.config.mode === 'balanced' ? 3 : 10;
const actionConfig = {
classification: input.classification,
@@ -40,6 +44,8 @@ export async function research(
mode: input.config.mode,
sources: input.config.sources,
hasEmbedding: false,
detectedLanguage: input.config.detectedLanguage,
isArticleSummary: input.config.isArticleSummary,
};
const availableTools = getAvailableActionTools(actionConfig);
@@ -69,6 +75,8 @@ export async function research(
maxIteration,
'',
input.config.locale,
input.config.detectedLanguage,
input.config.isArticleSummary,
);
const toolsForLlm = availableTools.map((t) => ({
@@ -147,6 +155,9 @@ export async function research(
session,
researchBlockId,
fileIds: input.config.fileIds,
isArticleSummary: input.config.isArticleSummary,
mode: input.config.mode,
originalQuery: input.classification.standaloneFollowUp || input.followUp,
});
actionOutput.push(...results);
@@ -189,7 +200,109 @@ export async function research(
data: filteredSearchResults,
});
const query = input.classification.standaloneFollowUp || input.followUp;
const rawResults = filteredSearchResults.map((chunk) => ({
url: chunk.metadata?.url as string || '',
title: chunk.metadata?.title as string || '',
content: chunk.content,
thumbnail: chunk.metadata?.thumbnail as string | undefined,
}));
const classificationResult = classifySearchResults({ results: rawResults, query });
const classified = classificationResult.classified;
const products = classified.filter((r): r is typeof r & { data: ProductData } => r.type === 'product' && r.data !== undefined).map((r) => r.data);
const videos = classified.filter((r): r is typeof r & { data: VideoData } => r.type === 'video' && r.data !== undefined).map((r) => r.data);
const profiles = classified.filter((r): r is typeof r & { data: ProfileData } => r.type === 'profile' && r.data !== undefined).map((r) => r.data);
const promos = classified.filter((r): r is typeof r & { data: PromoData } => r.type === 'promo' && r.data !== undefined).map((r) => r.data);
const images: ClassifiedResult[] = classified.filter((r) => r.type === 'image');
if (products.length >= 2) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'products',
params: { items: products, title: 'Товары' },
},
});
}
if (videos.length >= 2) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'videos',
params: { items: videos, title: 'Видео' },
},
});
}
if (profiles.length >= 2) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'profiles',
params: { items: profiles, title: 'Профили' },
},
});
}
if (promos.length >= 1) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'promos',
params: { items: promos, title: 'Промокоды' },
},
});
}
if (images.length >= 3) {
const imageData = images.slice(0, 8).map((img: ClassifiedResult) => ({
url: img.rawResult.thumbnail || img.rawResult.url,
title: img.rawResult.title,
source: img.source,
sourceUrl: img.rawResult.url,
}));
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'image_gallery',
params: { images: imageData, layout: 'grid' },
},
});
}
if (shouldGenerateKnowledgeCard(query)) {
const cardType = detectKnowledgeCardType(query);
if (cardType && ['comparison_table', 'stat_card'].includes(cardType)) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'knowledge_card_hint',
params: { suggestedType: cardType, query },
},
});
}
}
return { searchFindings: filteredSearchResults };
}
/**
 * Heuristically picks a knowledge-card type for a query, or null when no
 * card applies. Rules are evaluated in priority order, first match wins.
 */
function detectKnowledgeCardType(query: string): string | null {
  const normalized = query.toLowerCase();
  const rules: [RegExp, string][] = [
    [/сравни|vs\.?|или|лучше|разница между/, 'comparison_table'],
    [/статистика|процент|количество|сколько/, 'stat_card'],
    [/график|динамика|рост|тренд/, 'line_chart'],
    [/что такое|определение|понятие/, 'definition'],
  ];
  for (const [pattern, cardType] of rules) {
    if (pattern.test(normalized)) return cardType;
  }
  return null;
}
type ResearchBlockSubStep = { id: string; type: string; reasoning?: string; reading?: Chunk[]; searching?: string[] };

View File

@@ -1,12 +1,136 @@
import type { LlmClient } from '../llm-client.js';
import SessionManager from '../session.js';
import type { TextBlock } from '../types.js';
import type { TextBlock, Chunk } from '../types.js';
import type { ClassifierOutput } from '../actions/types.js';
import { detectLanguage } from '../prompts/detectLanguage.js';
import { getClassifierPrompt } from '../prompts/classifier.js';
import { getWriterPrompt } from '../prompts/writer.js';
import { classify } from './classifier.js';
import { fastClassify, generateSearchQueries } from './fastClassifier.js';
import { research } from './researcher.js';
import { executeAllWidgets } from '../widgets/index.js';
import { searchMedia, type MediaSearchResult } from '../media-search.js';
import { searchSearxng } from '../searxng.js';
import { rerankBM25, computeAdaptiveTopK, estimateQueryComplexity } from '../reranker.js';
const DISCOVER_SVC_URL = (process.env.DISCOVER_SVC_URL ?? '').replace(/\/$/, '');
/** Digest payload returned by discover-svc for an article URL. */
interface DigestResponse {
  /** Pre-generated summary text (Russian, per the `Ru` suffix). */
  summaryRu: string;
  /** Numbered citations backing the summary. */
  citations: { index: number; url: string; title: string; domain: string }[];
  /** Suggested follow-up questions. */
  followUp: string[];
  /** Number of sources — presumably the cluster size; confirm with discover-svc. */
  sourcesCount: number;
  /** Title of the cluster the article belongs to. */
  clusterTitle: string;
}
/**
 * Fetches a pre-generated digest for an article URL from discover-svc.
 * Returns null when the service is not configured, the request fails or
 * times out (3 s), the response is non-OK, or the payload lacks a summary
 * or citations — callers then fall back to live scraping.
 */
async function fetchPreGeneratedDigest(articleUrl: string): Promise<DigestResponse | null> {
  if (!DISCOVER_SVC_URL) return null;
  try {
    const res = await fetch(
      `${DISCOVER_SVC_URL}/api/v1/discover/digest?url=${encodeURIComponent(articleUrl)}`,
      { signal: AbortSignal.timeout(3000) },
    );
    if (!res.ok) return null;
    const data = (await res.json()) as DigestResponse;
    // Accept only digests that actually carry a summary and >= 1 citation.
    if (data.summaryRu && data.citations?.length > 0) return data;
    return null;
  } catch {
    // Network error or timeout: treat as "no digest available".
    return null;
  }
}
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
/**
 * Normalizes Crawl4AI's `markdown` field, which may be a plain string or an
 * object of variants. Preference: raw_markdown, then markdown_with_citations,
 * then non-empty fit_markdown; anything else yields ''.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (!md || typeof md !== 'object') return '';
  const variants = md as Record<string, unknown>;
  const raw = variants.raw_markdown;
  if (typeof raw === 'string') return raw;
  const cited = variants.markdown_with_citations;
  if (typeof cited === 'string') return cited;
  const fit = variants.fit_markdown;
  if (typeof fit === 'string' && fit.length > 0) return fit;
  return '';
}
/**
 * Fetches article content for a "Summary:" request.
 * Tries Crawl4AI first (markdown extraction); on any failure falls back to a
 * plain fetch with naive HTML stripping. Returns null when neither path
 * yields more than 100 characters. Content is capped at 15k chars.
 */
async function preScrapeArticleUrl(url: string): Promise<{ title: string; content: string; url: string } | null> {
  try {
    const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        urls: [url],
        crawler_config: { type: 'CrawlerRunConfig', params: { cache_mode: 'default', page_timeout: 20000 } },
      }),
      // Overall budget slightly above the crawler's page timeout.
      signal: AbortSignal.timeout(25000),
    });
    if (!res.ok) throw new Error(`Crawl4AI HTTP ${res.status}`);
    const data = (await res.json()) as {
      results?: { markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
    };
    const first = data.results?.[0];
    const md = extractCrawl4aiMarkdown(first?.markdown);
    // Require > 100 chars; very short markdown usually means a blocked/empty page.
    if (first?.success && md.length > 100) {
      return {
        title: first.metadata?.title ?? url,
        content: md.slice(0, 15000),
        url,
      };
    }
  } catch {
    // fallback to basic fetch
  }
  try {
    const res = await fetch(url, {
      signal: AbortSignal.timeout(10000),
      headers: { 'User-Agent': 'GooSeek-Agent/1.0' },
    });
    const html = await res.text();
    const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
    const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    const body = bodyMatch ? bodyMatch[1] : html;
    // Naive text extraction: drop scripts/styles/tags, collapse whitespace.
    const text = body
      .replace(/<script[\s\S]*?<\/script>/gi, '')
      .replace(/<style[\s\S]*?<\/style>/gi, '')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
      .slice(0, 15000);
    if (text.length > 100) return { title, content: text, url };
  } catch {
    // give up
  }
  return null;
}
/**
 * Query words (length >= 2) used for relevance scoring: the "Summary:"
 * prefix and any URLs are stripped, the rest is lower-cased and split on
 * whitespace.
 */
function queryTerms(query: string): Set<string> {
  const withoutPrefix = query.replace(/^Summary:\s*/i, '');
  const cleaned = withoutPrefix.replace(/https?:\/\/[^\s]+/g, '').toLowerCase();
  const terms = new Set<string>();
  for (const word of cleaned.split(/\s+/)) {
    if (word.length >= 2) terms.add(word);
  }
  return terms;
}
/** Relevance score: +3 per query term found in the title, +1 per term found in the content. */
function relevanceScore(chunk: Chunk, terms: Set<string>): number {
  const titleLower = String(chunk.metadata?.title ?? '').toLowerCase();
  const contentLower = (chunk.content ?? '').toLowerCase();
  let total = 0;
  terms.forEach((term) => {
    if (titleLower.includes(term)) total += 3;
    if (contentLower.includes(term)) total += 1;
  });
  return total;
}
/**
 * Sorts chunks by descending relevance to the query without mutating the
 * input. Returns the input unchanged when there is nothing to rank.
 */
function rankByRelevance(chunks: Chunk[], query: string): Chunk[] {
  if (!chunks.length) return [];
  const terms = queryTerms(query);
  if (!terms.size) return chunks;
  const ranked = chunks.slice();
  ranked.sort((left, right) => relevanceScore(right, terms) - relevanceScore(left, terms));
  return ranked;
}
export type SearchOrchestratorConfig = {
llm: LlmClient;
@@ -27,20 +151,287 @@ export type SearchOrchestratorInput = {
config: SearchOrchestratorConfig;
};
/**
 * SPEED MODE: Optimized fast path
 * - No LLM classifier (rule-based)
 * - Direct SearXNG search (no researcher loop)
 * - No page scraping (snippets only)
 * - Parallel: search + media + widgets
 * Target: 8-15 seconds (like Perplexity)
 */
async function runSpeedMode(
  session: SessionManager,
  input: {
    chatHistory: { role: string; content: string }[];
    followUp: string;
    config: SearchOrchestratorConfig;
    detectedLanguage?: string;
  },
): Promise<void> {
  const { chatHistory, followUp, config, detectedLanguage } = input;
  // Rule-based classification only — no LLM round-trip on this path.
  const classification = fastClassify(followUp, chatHistory);
  const searchQuery = classification.standaloneFollowUp || followUp;
  const queries = generateSearchQueries(searchQuery);
  const researchBlockId = crypto.randomUUID();
  session.emitBlock({
    id: researchBlockId,
    type: 'research',
    data: { subSteps: [{ id: crypto.randomUUID(), type: 'searching', searching: queries }] },
  });
  // All queries fire in parallel; results are deduplicated by URL. Snippet
  // text is used as content (no page scraping in speed mode).
  const searchPromise = (async () => {
    const results: Chunk[] = [];
    const seenUrls = new Set<string>();
    await Promise.all(
      queries.map(async (q) => {
        try {
          const res = await searchSearxng(q, { categories: ['general', 'news'], pageno: 1 });
          for (const r of res.results ?? []) {
            if (r.url && !seenUrls.has(r.url)) {
              seenUrls.add(r.url);
              results.push({
                content: r.content || r.title,
                metadata: { title: r.title, url: r.url },
              });
            }
          }
        } catch { /* ignore search errors */ }
      }),
    );
    return results;
  })();
  // Widgets stream to the client as soon as they are computed.
  const widgetPromise = executeAllWidgets({
    chatHistory,
    followUp,
    classification: classification as ClassifierOutput,
    llm: config.llm,
  }).then((outputs) => {
    for (const o of outputs) {
      session.emitBlock({
        id: crypto.randomUUID(),
        type: 'widget',
        data: { widgetType: o.type, params: o.data ?? {} },
      });
    }
    return outputs;
  });
  // Image/video galleries are emitted eagerly, independent of the text answer.
  const mediaPromise = searchMedia(searchQuery, { maxImages: 6, maxVideos: 4 }).then((mediaResult) => {
    if (mediaResult.images.length > 0) {
      session.emitBlock({
        id: crypto.randomUUID(),
        type: 'widget',
        data: {
          widgetType: 'image_gallery',
          params: { images: mediaResult.images, layout: 'carousel' },
        },
      });
    }
    if (mediaResult.videos.length > 0) {
      session.emitBlock({
        id: crypto.randomUUID(),
        type: 'widget',
        data: {
          widgetType: 'videos',
          params: { items: mediaResult.videos, title: '' },
        },
      });
    }
    return mediaResult;
  });
  // Media result is awaited for completion but intentionally unused below.
  const [searchResults, widgetOutputs] = await Promise.all([
    searchPromise,
    widgetPromise,
    mediaPromise,
  ]);
  session.emitBlock({
    id: crypto.randomUUID(),
    type: 'source',
    data: searchResults,
  });
  session.emit('data', { type: 'researchComplete' });
  // BM25 rerank with an adaptive cutoff derived from result count and query complexity.
  const queryComplexity = estimateQueryComplexity(searchQuery);
  const adaptiveTopK = computeAdaptiveTopK(searchResults.length, queryComplexity, 'speed');
  const rerankableItems = searchResults.map((r) => ({
    content: r.content,
    title: (r.metadata?.title as string) ?? '',
    url: (r.metadata?.url as string) ?? '',
    metadata: r.metadata,
  }));
  const rankedItems = rerankBM25(rerankableItems, searchQuery, adaptiveTopK);
  const findingsForWriter = rankedItems.slice(0, 15).map((item) => ({
    content: item.content,
    metadata: item.metadata ?? { title: item.title, url: item.url },
  }));
  // Snippets are clipped hard to keep the writer prompt small (speed over depth).
  const MAX_CONTENT_PER_RESULT = 250;
  const finalContext = findingsForWriter
    .map((f, index) => {
      const content = f.content.length > MAX_CONTENT_PER_RESULT
        ? f.content.slice(0, MAX_CONTENT_PER_RESULT) + '…'
        : f.content;
      return `<result index=${index + 1} title="${String(f.metadata?.title ?? '').replace(/"/g, "'")}">${content}</result>`;
    })
    .join('\n') || '';
  const widgetContext = widgetOutputs
    .map((o) => `<result>${o.llmContext}</result>`)
    .join('\n-------------\n');
  const finalContextWithWidgets =
    `<search_results note="These are the search results and assistant can cite these">\n${finalContext}\n</search_results>\n` +
    `<widgets_result noteForAssistant="Its output is already showed to the user, assistant can use this information to answer the query but do not CITE this as a source">\n${widgetContext}\n</widgets_result>`;
  const writerPrompt = getWriterPrompt(
    finalContextWithWidgets,
    config.systemInstructions,
    'speed',
    config.locale,
    config.memoryContext,
    config.answerMode,
    config.responsePrefs,
    config.learningMode,
    detectedLanguage,
    false,
  );
  const answerStream = config.llm.streamText({
    messages: [
      { role: 'system', content: writerPrompt },
      ...chatHistory,
      { role: 'user', content: followUp },
    ],
    options: { maxTokens: 2048 },
  });
  let responseBlockId = '';
  let hasContent = false;
  let accumulatedText = '';
  // Streaming: the first non-empty chunk creates the text block; later chunks
  // are sent as incremental 'textChunk' events while the in-memory block copy
  // is kept in sync for clients that re-read the block.
  for await (const chunk of answerStream) {
    const chunkText = chunk.contentChunk ?? '';
    if (!chunkText && !responseBlockId) continue;
    if (!responseBlockId) {
      accumulatedText = chunkText;
      const block: TextBlock = {
        id: crypto.randomUUID(),
        type: 'text',
        data: chunkText,
      };
      session.emitBlock(block);
      responseBlockId = block.id;
      if (chunkText) hasContent = true;
    } else if (chunkText) {
      accumulatedText += chunkText;
      hasContent = true;
      session.emit('data', {
        type: 'textChunk',
        blockId: responseBlockId,
        chunk: chunkText,
      });
      const block = session.getBlock(responseBlockId) as TextBlock | null;
      if (block) {
        block.data = accumulatedText;
      }
    }
  }
  // Final patch so the persisted block matches the full streamed text.
  if (responseBlockId) {
    session.updateBlock(responseBlockId, [{ op: 'replace', path: '/data', value: accumulatedText }]);
  }
  // Degraded fallback when the LLM produced no text: render raw findings as a list.
  if (!hasContent && findingsForWriter.length > 0) {
    const lines = findingsForWriter.slice(0, 8).map((f, i) => {
      const title = (f.metadata?.title as string) ?? 'Без названия';
      const excerpt = f.content.length > 100 ? f.content.slice(0, 100) + '…' : f.content;
      return `${i + 1}. **${title}** — ${excerpt}`;
    });
    session.emitBlock({
      id: crypto.randomUUID(),
      type: 'text',
      data: `## По найденным источникам\n\n${lines.join('\n\n')}\n\n*Ответ LLM недоступен.*`,
    });
  }
  session.emit('end', {});
}
export async function runSearchOrchestrator(
session: SessionManager,
input: SearchOrchestratorInput,
): Promise<void> {
const { chatHistory, followUp, config } = input;
const detectedLanguage = detectLanguage(followUp);
const isArticleSummary = followUp.trim().startsWith('Summary: ') && followUp.trim().length > 9;
const classification = await classify({
// SPEED MODE: Fast path — no LLM classifier, direct search, no scraping
if (config.mode === 'speed' && !isArticleSummary) {
await runSpeedMode(session, { chatHistory, followUp, config, detectedLanguage });
return;
}
let preScrapedArticle: { title: string; content: string; url: string } | null = null;
if (isArticleSummary) {
const articleUrl = followUp.trim().slice(9).trim();
const [digest, preScrapeResult] = await Promise.all([
fetchPreGeneratedDigest(articleUrl),
preScrapeArticleUrl(articleUrl),
]);
if (digest) {
const sourceBlock = {
id: crypto.randomUUID(),
type: 'source' as const,
data: digest.citations.map((c: { index: number; url: string; title: string; domain: string }) => ({
content: c.title,
metadata: { url: c.url, title: c.title, domain: c.domain, index: c.index },
})),
};
session.emitBlock(sourceBlock);
session.emit('data', { type: 'researchComplete' });
let summaryText = digest.summaryRu;
if (digest.followUp && digest.followUp.length > 0) {
summaryText += '\n\n---\n' + digest.followUp.map((q: string) => `> ${q}`).join('\n');
}
session.emitBlock({ id: crypto.randomUUID(), type: 'text', data: summaryText });
session.emit('end', {});
return;
}
preScrapedArticle = preScrapeResult;
}
let classification = await classify({
chatHistory,
query: followUp,
llm: config.llm,
locale: config.locale,
detectedLanguage,
enabledSources: config.sources,
});
if (isArticleSummary && classification.classification.skipSearch) {
classification = {
...classification,
classification: { ...classification.classification, skipSearch: false },
};
}
const widgetPromise = executeAllWidgets({
chatHistory,
followUp,
@@ -57,28 +448,87 @@ export async function runSearchOrchestrator(
return outputs;
});
const mediaQuery = classification.standaloneFollowUp || followUp;
const mediaPromise = !isArticleSummary
? searchMedia(mediaQuery, { maxImages: 8, maxVideos: 6 }).then((mediaResult) => {
if (mediaResult.images.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'image_gallery',
params: {
images: mediaResult.images,
layout: 'carousel',
},
},
});
}
if (mediaResult.videos.length > 0) {
session.emitBlock({
id: crypto.randomUUID(),
type: 'widget',
data: {
widgetType: 'videos',
params: {
items: mediaResult.videos,
title: '',
},
},
});
}
return mediaResult;
})
: Promise.resolve({ images: [], videos: [] } as MediaSearchResult);
let searchPromise: Promise<{ searchFindings: import('../types.js').Chunk[] }> | null = null;
const effectiveFollowUp = (isArticleSummary && preScrapedArticle?.title)
? `Summary: ${preScrapedArticle.url}\nArticle title: ${preScrapedArticle.title}`
: followUp;
if (!classification.classification.skipSearch) {
searchPromise = research(session, config.llm, {
chatHistory,
followUp,
classification,
followUp: effectiveFollowUp,
classification: isArticleSummary && preScrapedArticle
? { ...classification, standaloneFollowUp: `${preScrapedArticle.title} ${classification.standaloneFollowUp}` }
: classification,
config: {
mode: config.mode,
sources: config.sources,
fileIds: config.fileIds,
locale: config.locale,
detectedLanguage,
isArticleSummary,
},
});
}
const [widgetOutputs, searchResults] = await Promise.all([widgetPromise, searchPromise ?? Promise.resolve({ searchFindings: [] })]);
const [widgetOutputs, searchResults] = await Promise.all([
widgetPromise,
searchPromise ?? Promise.resolve({ searchFindings: [] }),
mediaPromise,
]);
session.emit('data', { type: 'researchComplete' });
const MAX_RESULTS_FOR_WRITER = 15;
const MAX_CONTENT_PER_RESULT = 180;
const findingsForWriter = (searchResults?.searchFindings ?? []).slice(0, MAX_RESULTS_FOR_WRITER);
const MAX_RESULTS_FOR_WRITER = isArticleSummary ? 30 : 25;
const MAX_CONTENT_PER_RESULT = isArticleSummary ? 2000 : 320;
const rawFindings = searchResults?.searchFindings ?? [];
if (isArticleSummary && preScrapedArticle) {
const alreadyHasUrl = rawFindings.some(
(f) => (f.metadata?.url as string)?.includes(preScrapedArticle!.url),
);
if (!alreadyHasUrl) {
rawFindings.unshift({
content: preScrapedArticle.content,
metadata: { url: preScrapedArticle.url, title: preScrapedArticle.title },
});
}
}
const findingsForWriter = rankByRelevance(rawFindings, followUp).slice(0, MAX_RESULTS_FOR_WRITER);
const finalContext =
findingsForWriter
.map((f, index) => {
@@ -104,6 +554,8 @@ export async function runSearchOrchestrator(
config.answerMode,
config.responsePrefs,
config.learningMode,
detectedLanguage,
isArticleSummary,
);
const answerStream = config.llm.streamText({
@@ -117,27 +569,43 @@ export async function runSearchOrchestrator(
let responseBlockId = '';
let hasContent = false;
let accumulatedText = '';
for await (const chunk of answerStream) {
if (!chunk.contentChunk && !responseBlockId) continue;
const chunkText = chunk.contentChunk ?? '';
if (!chunkText && !responseBlockId) continue;
if (!responseBlockId) {
accumulatedText = chunkText;
const block: TextBlock = {
id: crypto.randomUUID(),
type: 'text',
data: chunk.contentChunk ?? '',
data: chunkText,
};
session.emitBlock(block);
responseBlockId = block.id;
if (chunk.contentChunk) hasContent = true;
} else {
if (chunkText) hasContent = true;
} else if (chunkText) {
accumulatedText += chunkText;
hasContent = true;
// Отправляем только новый чанк для немедленного отображения
session.emit('data', {
type: 'textChunk',
blockId: responseBlockId,
chunk: chunkText
});
// Также обновляем полный блок для консистентности
const block = session.getBlock(responseBlockId) as TextBlock | null;
if (block) {
block.data += chunk.contentChunk ?? '';
if (chunk.contentChunk) hasContent = true;
session.updateBlock(block.id, [{ op: 'replace', path: '/data', value: block.data }]);
block.data = accumulatedText;
}
}
}
// Финальное обновление блока
if (responseBlockId) {
session.updateBlock(responseBlockId, [{ op: 'replace', path: '/data', value: accumulatedText }]);
}
if (!hasContent && findingsForWriter.length > 0) {
const lines = findingsForWriter.slice(0, 10).map((f, i) => {

View File

@@ -0,0 +1,515 @@
/**
* Content Classifier for GooSeek
* Determines content type from URL patterns and query keywords
*/
import {
ContentType,
SourcePlatform,
ClassifiedResult,
ProductData,
VideoData,
ProfileData,
PromoData,
} from './types/widgets.js';
// ============================================
// URL PATTERNS
// ============================================
/**
 * URL signatures per content type. classifyByUrl() returns the FIRST type
 * whose pattern list matches, in this object's key order. 'article' and
 * 'knowledge_card' have no URL signature and are detected from the query.
 */
const URL_PATTERNS: Record<ContentType, RegExp[]> = {
  product: [
    /ozon\.ru\/product\//i,
    /wildberries\.ru\/catalog\/\d+/i,
    /aliexpress\.(ru|com)\/item\//i,
    /market\.yandex\.ru\/product\//i,
    /beru\.ru\/product\//i,
    /goods\.ru\/catalog\//i,
    /mvideo\.ru\/products\//i,
    /dns-shop\.ru\/product\//i,
    /citilink\.ru\/product\//i,
    /eldorado\.ru\/cat\/detail\//i,
    /lamoda\.ru\/p\//i,
    /amazon\.(com|ru)\/dp\//i,
  ],
  video: [
    /rutube\.ru\/video\//i,
    /vk\.com\/video/i,
    /vk\.com\/clip/i,
    /youtube\.com\/watch/i,
    /youtu\.be\//i,
    /dzen\.ru\/video\//i,
    /ok\.ru\/video\//i,
    /tiktok\.com\/@[\w.-]+\/video/i,
  ],
  profile: [
    // End-anchored: a bare username path, excluding known content paths.
    /vk\.com\/(?!video|clip|wall|photo|doc|audio)[a-zA-Z0-9_.]+$/i,
    /t\.me\/[a-zA-Z0-9_]+$/i,
    /instagram\.com\/[a-zA-Z0-9_.]+\/?$/i,
    /twitter\.com\/[a-zA-Z0-9_]+\/?$/i,
    /x\.com\/[a-zA-Z0-9_]+\/?$/i,
    /facebook\.com\/[a-zA-Z0-9.]+\/?$/i,
    /youtube\.com\/@/i,
    /youtube\.com\/channel\//i,
    /dzen\.ru\/[a-zA-Z0-9_-]+$/i,
    /ok\.ru\/profile\//i,
  ],
  promo: [
    // Keyword-based: these match anywhere in the URL, not just the host.
    /promokod/i,
    /coupon/i,
    /скидк/i,
    /cuponation/i,
    /promocode/i,
    /\.ru\/promo\//i,
    /discount/i,
  ],
  image: [
    /\.(jpg|jpeg|png|gif|webp|svg)(\?|$)/i,
    /images\./i,
    /img\./i,
    /photo\./i,
  ],
  article: [],
  knowledge_card: [],
};
// ============================================
// PLATFORM DETECTION
// ============================================
/**
 * Platform detection patterns. detectPlatform() returns the FIRST matching
 * entry in this key order, with the catch-all `other` last.
 *
 * Fix: `dzen\.ru` was previously also part of the `yandex` pattern, which
 * made the dedicated `dzen` entry below unreachable — yandex always matched
 * dzen.ru URLs first.
 */
const PLATFORM_PATTERNS: Record<SourcePlatform, RegExp> = {
  yandex: /yandex\.(ru|com)|ya\.ru/i,
  vk: /vk\.com|vkontakte\.ru/i,
  rutube: /rutube\.ru/i,
  ozon: /ozon\.ru/i,
  wildberries: /wildberries\.ru/i,
  aliexpress: /aliexpress\.(ru|com)/i,
  youtube: /youtube\.com|youtu\.be/i,
  telegram: /t\.me|telegram\.(org|me)/i,
  dzen: /dzen\.ru/i,
  other: /.*/,
};
// ============================================
// QUERY PATTERNS (для определения намерения)
// ============================================
/**
 * Query-intent signatures per content type (mostly Russian keywords).
 * classifyByQuery() returns the FIRST type whose list matches, in this key
 * order — e.g. "обзор телефона" resolves to 'product' (via the combined
 * pattern) before the bare 'video' "обзор" pattern is ever checked.
 */
const QUERY_PATTERNS: Record<ContentType, RegExp[]> = {
  product: [
    /купить/i,
    /цена/i,
    /стоимость/i,
    /заказать/i,
    /где купить/i,
    /сколько стоит/i,
    /отзывы о/i,
    /обзор .*(товар|продукт|гаджет|телефон|ноутбук)/i,
    /лучший .*(телефон|смартфон|ноутбук|планшет)/i,
    /топ \d+/i,
    /рейтинг/i,
    /сравнение/i,
  ],
  video: [
    /видео/i,
    /смотреть/i,
    /как сделать/i,
    /туториал/i,
    /обзор/i,
    /влог/i,
    /трейлер/i,
    /клип/i,
    /фильм/i,
    /сериал/i,
  ],
  profile: [
    /профиль/i,
    /страница/i,
    /канал/i,
    /блогер/i,
    /инстаграм/i,
    /телеграм/i,
    /вконтакте/i,
    /подписчик/i,
  ],
  promo: [
    /промокод/i,
    /скидка/i,
    /купон/i,
    /акция/i,
    /распродажа/i,
    /бесплатная доставка/i,
    /скидочный код/i,
    /промо/i,
  ],
  image: [
    /картинк/i,
    /фото/i,
    /изображен/i,
    /обои/i,
    /wallpaper/i,
  ],
  article: [],
  knowledge_card: [
    // NOTE(review): /vs/ is unanchored and matches "vs" inside longer words — confirm intended.
    /что такое/i,
    /определение/i,
    /сравни/i,
    /vs/i,
    /против/i,
    /разница между/i,
    /статистика/i,
    /график/i,
    /динамика/i,
  ],
};
// ============================================
// CLASSIFIER FUNCTIONS
// ============================================
/**
 * Resolves the source platform for a URL. Entries in PLATFORM_PATTERNS are
 * tried in declaration order; the catch-all `other` pattern always matches,
 * so a platform is always returned.
 */
export function detectPlatform(url: string): SourcePlatform {
  const match = Object.entries(PLATFORM_PATTERNS).find(([, pattern]) => pattern.test(url));
  return match ? (match[0] as SourcePlatform) : 'other';
}
/**
* Определяет тип контента по URL
*/
export function classifyByUrl(url: string): ContentType | null {
for (const [type, patterns] of Object.entries(URL_PATTERNS)) {
for (const pattern of patterns) {
if (pattern.test(url)) {
return type as ContentType;
}
}
}
return null;
}
/**
* Определяет тип контента по запросу пользователя
*/
export function classifyByQuery(query: string): ContentType | null {
for (const [type, patterns] of Object.entries(QUERY_PATTERNS)) {
for (const pattern of patterns) {
if (pattern.test(query)) {
return type as ContentType;
}
}
}
return null;
}
/** True when the result embeds a player (iframe_src) or its URL matches a video pattern. */
export function isVideoResult(result: { url: string; content?: string; iframe_src?: string }): boolean {
  const hasEmbed = Boolean(result.iframe_src);
  return hasEmbed || URL_PATTERNS.video.some((pattern) => pattern.test(result.url));
}
/** True when the result URL matches a known marketplace product pattern. */
export function isProductResult(result: { url: string; content?: string }): boolean {
  for (const pattern of URL_PATTERNS.product) {
    if (pattern.test(result.url)) return true;
  }
  return false;
}
/** True when the result URL matches a social-profile pattern. */
export function isProfileResult(result: { url: string }): boolean {
  return Boolean(URL_PATTERNS.profile.find((pattern) => pattern.test(result.url)));
}
/**
 * True when the result is a promo/coupon hit: either its URL matches a promo
 * pattern outright, or its snippet mentions promo keywords AND the query
 * itself expresses promo intent.
 */
export function isPromoResult(result: { url: string; content?: string }, query: string): boolean {
  if (URL_PATTERNS.promo.some((pattern) => pattern.test(result.url))) return true;
  const snippetMentionsPromo = Boolean(result.content && /промокод|скидк|купон/i.test(result.content));
  const queryAsksPromo = QUERY_PATTERNS.promo.some((pattern) => pattern.test(query));
  return snippetMentionsPromo && queryAsksPromo;
}
/**
 * Fully classifies a single search result: content type, source platform,
 * confidence and (for video/product/profile) extracted structured data.
 * Precedence: URL patterns (confidence 0.9) beat query-intent patterns
 * (0.7); the default is 'article' at 0.5. Successful data extraction raises
 * confidence to 0.95 (0.9 for profiles).
 */
export function classifySearchResult(
  result: { url: string; title: string; content?: string; thumbnail?: string },
  query: string
): ClassifiedResult {
  const platform = detectPlatform(result.url);
  let type: ContentType = 'article';
  let confidence = 0.5;
  let data: ProductData | VideoData | ProfileData | PromoData | null = null;
  const urlType = classifyByUrl(result.url);
  if (urlType) {
    type = urlType;
    confidence = 0.9;
  } else {
    // Fall back to query intent; 'article' from the query never overrides the default.
    const queryType = classifyByQuery(query);
    if (queryType && queryType !== 'article') {
      type = queryType;
      confidence = 0.7;
    }
  }
  if (type === 'video') {
    data = extractVideoData(result, platform);
    confidence = data ? 0.95 : confidence;
  } else if (type === 'product') {
    data = extractProductData(result, platform);
    confidence = data ? 0.95 : confidence;
  } else if (type === 'profile') {
    data = extractProfileData(result, platform);
    confidence = data ? 0.9 : confidence;
  }
  return {
    type,
    source: platform,
    confidence,
    data,
    rawResult: result,
  };
}
/**
 * Buckets classified results by their content type. Every type key is
 * present in the output, empty arrays included; input order is preserved
 * within each bucket.
 */
export function groupResultsByType(results: ClassifiedResult[]): Record<ContentType, ClassifiedResult[]> {
  const buckets: Record<ContentType, ClassifiedResult[]> = {
    product: [],
    video: [],
    profile: [],
    promo: [],
    article: [],
    image: [],
    knowledge_card: [],
  };
  results.forEach((result) => {
    buckets[result.type].push(result);
  });
  return buckets;
}
// ============================================
// DATA EXTRACTORS
// ============================================
/**
 * Builds VideoData from a search result. For known platforms a parseable
 * video ID is required — null is returned without one; 'other' platforms
 * are accepted ID-less. Duration and author are unknown at this stage.
 */
function extractVideoData(
  result: { url: string; title: string; content?: string; thumbnail?: string },
  platform: SourcePlatform
): VideoData | null {
  const videoId = extractVideoId(result.url, platform);
  const idRequired = platform !== 'other';
  if (idRequired && !videoId) return null;
  const video: VideoData = {
    id: videoId || undefined,
    title: result.title,
    thumbnail: result.thumbnail || '',
    url: result.url,
    duration: 0,
    author: '',
    platform: platform as VideoData['platform'],
    description: result.content,
  };
  return video;
}
/**
 * Extracts the platform-specific video ID from a URL.
 * Supports YouTube (?v= or youtu.be path), Rutube and VK; returns null for
 * other platforms or unparseable URLs.
 */
function extractVideoId(url: string, platform: SourcePlatform): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  if (platform === 'youtube') {
    const fromQuery = parsed.searchParams.get('v');
    if (fromQuery) return fromQuery;
    return parsed.hostname === 'youtu.be' ? parsed.pathname.slice(1) : null;
  }
  if (platform === 'rutube') {
    return url.match(/rutube\.ru\/video\/([a-f0-9]+)/i)?.[1] ?? null;
  }
  if (platform === 'vk') {
    return url.match(/video(-?\d+_\d+)/i)?.[1] ?? null;
  }
  return null;
}
/**
 * Builds ProductData from a search result.
 *
 * The price is parsed from the snippet: a digit group (thousands spaces
 * allowed) followed by a ruble marker ("₽", "руб", "руб." or "рублей").
 * Fix: the original pattern used a character class `[₽руб\.рублей]`, which
 * matches any SINGLE character from that set (so "100 е" or "5 л" would
 * pass) — replaced with a proper alternation. Price defaults to 0 when no
 * marker is found.
 */
function extractProductData(
  result: { url: string; title: string; content?: string; thumbnail?: string },
  platform: SourcePlatform
): ProductData | null {
  const priceMatch = result.content?.match(/(\d[\d\s]*)\s*(?:₽|руб(?:лей)?\.?)/i);
  const price = priceMatch ? parseInt(priceMatch[1].replace(/\s/g, ''), 10) : 0;
  const marketplace = getMarketplace(platform, result.url);
  return {
    title: result.title,
    price,
    currency: '₽',
    image: result.thumbnail || '',
    url: result.url,
    marketplace,
  };
}
/**
 * Maps a platform/URL pair to a marketplace identifier. URL substrings act
 * as a fallback when the platform tag alone is inconclusive.
 */
function getMarketplace(platform: SourcePlatform, url: string): ProductData['marketplace'] {
  const isOzon = platform === 'ozon' || /ozon\.ru/i.test(url);
  if (isOzon) return 'ozon';
  const isWildberries = platform === 'wildberries' || /wildberries\.ru/i.test(url);
  if (isWildberries) return 'wildberries';
  if (platform === 'aliexpress' || /aliexpress/i.test(url)) return 'aliexpress';
  return /market\.yandex/i.test(url) ? 'yandex_market' : 'other';
}
/**
 * Builds ProfileData from a search result. The username is derived from the
 * URL when possible; follower counts and avatars beyond the thumbnail are
 * not available at this stage.
 */
function extractProfileData(
  result: { url: string; title: string; content?: string; thumbnail?: string },
  platform: SourcePlatform
): ProfileData | null {
  const handle = extractUsername(result.url, platform);
  const profile: ProfileData = {
    name: result.title,
    username: handle || undefined,
    avatar: result.thumbnail,
    url: result.url,
    platform: platform as ProfileData['platform'],
    description: result.content,
  };
  return profile;
}
/**
 * Extracts a username/handle from a profile URL. VK and Telegram use the
 * whole path; YouTube only recognizes the "/@handle" form; every other
 * platform takes the first path segment. Unparseable URLs yield null.
 */
function extractUsername(url: string, platform: SourcePlatform): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  const path = parsed.pathname;
  switch (platform) {
    case 'vk':
    case 'telegram':
      return path.slice(1) || null;
    case 'youtube':
      return path.startsWith('/@') ? path.slice(2) : null;
    default: {
      const segments = path.split('/').filter(Boolean);
      return segments.length > 0 ? segments[0] : null;
    }
  }
}
// ============================================
// KNOWLEDGE CARD DETECTION
// ============================================
/** True when the query matches any knowledge-card intent pattern. */
export function shouldGenerateKnowledgeCard(query: string): boolean {
  for (const pattern of QUERY_PATTERNS.knowledge_card) {
    if (pattern.test(query)) return true;
  }
  return false;
}
/**
 * Picks the knowledge-card kind for a query. Rules are tried in order
 * (comparison, line chart, pie chart, definition, timeline); the first
 * match wins, null when nothing matches.
 */
export function detectKnowledgeCardType(query: string): string | null {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/сравни|vs|против|разница/i, 'comparison_table'],
    [/график|динамика|изменение/i, 'line_chart'],
    [/статистика|процент|доля/i, 'pie_chart'],
    [/что такое|определение/i, 'definition'],
    [/хронология|история|когда/i, 'timeline'],
  ];
  for (const [pattern, cardType] of rules) {
    if (pattern.test(query)) return cardType;
  }
  return null;
}
// ============================================
// BATCH CLASSIFICATION
// ============================================
/** Input to classifySearchResults: raw search hits plus the originating query. */
export interface SearchResultBatch {
  results: Array<{ url: string; title: string; content?: string; thumbnail?: string }>;
  query: string;
}
/** Output of classifySearchResults. */
export interface ClassificationResult {
  classified: ClassifiedResult[];                    // per-result classification, input order
  grouped: Record<ContentType, ClassifiedResult[]>;  // same results bucketed by type
  suggestedWidgets: ContentType[];                   // widget types worth rendering for this batch
  knowledgeCardType: string | null;                  // knowledge-card kind, when the query warrants one
}
/**
 * Classifies a batch of search results and derives widget suggestions.
 * Thresholds: products need >= 2 hits to suggest a widget; video, profile
 * and promo need >= 1. A knowledge-card type is only resolved when the
 * query matches a knowledge-card intent pattern.
 */
export function classifySearchResults(batch: SearchResultBatch): ClassificationResult {
  const classified = batch.results.map(r => classifySearchResult(r, batch.query));
  const grouped = groupResultsByType(classified);
  const suggestedWidgets: ContentType[] = [];
  if (grouped.product.length >= 2) suggestedWidgets.push('product');
  if (grouped.video.length >= 1) suggestedWidgets.push('video');
  if (grouped.profile.length >= 1) suggestedWidgets.push('profile');
  if (grouped.promo.length >= 1) suggestedWidgets.push('promo');
  const knowledgeCardType = shouldGenerateKnowledgeCard(batch.query)
    ? detectKnowledgeCardType(batch.query)
    : null;
  return {
    classified,
    grouped,
    suggestedWidgets,
    knowledgeCardType,
  };
}

View File

@@ -0,0 +1,292 @@
/**
* Crawl4AI REST API client with stealth mode support
*
* Features:
* - Standard crawling for trusted sites
* - Stealth mode with proxy rotation for protected sites
* - Automatic fallback strategies
*/
import { smartCrawl, stealthCrawl, needsStealthMode } from './stealth-crawler.js';
/** Normalized result of crawling a single URL. */
export interface CrawlResult {
  url: string;            // the crawled URL
  title: string;          // page title (falls back to the URL itself)
  content: string;        // extracted markdown/plain text
  success: boolean;       // whether usable content was obtained
  statusCode?: number;    // HTTP status, when known
  blocked?: boolean;      // copied from the stealth crawler — presumably "bot-blocked"; confirm in stealth-crawler
  usedStealth?: boolean;  // true when the stealth path produced this result
}
// Crawl4AI endpoint (trailing slash stripped); empty string disables the service.
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? '').replace(/\/$/, '');
// Overall timeout for one Crawl4AI HTTP request.
const TIMEOUT_MS = 45_000;
// Safety cap on batch size for crawlUrls(); extra URLs are silently dropped.
const MAX_URLS_PER_BATCH = 10;
/**
 * Crawl4AI may return markdown as a bare string or as an object with several
 * variants; pick the best one: raw_markdown, then markdown_with_citations,
 * then a non-empty fit_markdown. Anything else yields ''.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (md === null || md === undefined || typeof md !== 'object') return '';
  const variants = md as Record<string, unknown>;
  for (const key of ['raw_markdown', 'markdown_with_citations'] as const) {
    const value = variants[key];
    if (typeof value === 'string') return value;
  }
  const fit = variants.fit_markdown;
  return typeof fit === 'string' && fit.length > 0 ? fit : '';
}
/**
 * Strips script/style blocks and all tags from HTML, collapses whitespace,
 * and caps the output at 50k characters.
 */
function stripHtmlToText(html: string): string {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, '');
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, '');
  const text = withoutStyles
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return text.slice(0, 50_000);
}
/** Fallback: fetch URL and extract title + text without Crawl4AI */
async function fetchFallback(url: string): Promise<CrawlResult> {
  try {
    const res = await fetch(url, {
      signal: AbortSignal.timeout(10000),
      headers: {
        // Desktop Chrome UA — some sites serve empty shells to unknown agents.
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
      },
    });
    const html = await res.text();
    const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
    const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    const body = bodyMatch ? bodyMatch[1] : html;
    const content = stripHtmlToText(body);
    return {
      url,
      title,
      content: content || title, // never return empty content for a reachable page
      success: res.ok,
      statusCode: res.status,
    };
  } catch (err) {
    // Network failure / timeout: return an unsuccessful result instead of throwing,
    // so batch callers can keep processing the remaining URLs.
    const msg = err instanceof Error ? err.message : String(err);
    return {
      url,
      title: `Error: ${url}`,
      content: `Failed to fetch: ${msg}`,
      success: false,
    };
  }
}
/**
 * Standard Crawl4AI request for a single URL. Falls back to a plain fetch
 * (fetchFallback) when Crawl4AI is not configured, responds non-OK, errors,
 * or returns too little markdown.
 */
async function standardCrawl(url: string): Promise<CrawlResult> {
  if (!CRAWL4AI_URL) {
    return fetchFallback(url);
  }
  try {
    const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        urls: [url],
        crawler_config: {
          type: 'CrawlerRunConfig',
          params: {
            cache_mode: 'bypass',
            page_timeout: 30000,
            // Light anti-bot evasion flags supported by Crawl4AI.
            simulate_user: true,
            override_navigator: true,
          },
        },
      }),
      signal: AbortSignal.timeout(TIMEOUT_MS),
    });
    if (!res.ok) {
      return fetchFallback(url);
    }
    const data = (await res.json()) as {
      results?: Array<{
        success?: boolean;
        markdown?: unknown;
        metadata?: { title?: string };
        status_code?: number;
      }>;
    };
    const first = data.results?.[0];
    const markdown = extractCrawl4aiMarkdown(first?.markdown);
    // Require > 100 chars of markdown; shorter output usually means a blocked/empty page.
    if (first?.success && markdown.length > 100) {
      return {
        url,
        title: first.metadata?.title ?? url,
        content: markdown,
        success: true,
        statusCode: first.status_code,
      };
    }
    return fetchFallback(url);
  } catch {
    return fetchFallback(url);
  }
}
/**
 * Crawl single URL with automatic stealth detection
 */
export async function crawlUrl(url: string): Promise<CrawlResult> {
  // Known-protected domains go straight to the stealth path.
  if (needsStealthMode(url)) {
    const result = await smartCrawl(url);
    return {
      url: result.url,
      title: result.title,
      content: result.content,
      success: result.success,
      statusCode: result.statusCode,
      blocked: result.blocked,
      usedStealth: true,
    };
  }
  const standard = await standardCrawl(url);
  // Retry with stealth when the standard crawl failed or produced thin content (< 500 chars).
  if (!standard.success || standard.content.length < 500) {
    const stealth = await stealthCrawl(url, { maxRetries: 2 });
    return {
      url: stealth.url,
      title: stealth.title,
      content: stealth.content,
      success: stealth.success,
      statusCode: stealth.statusCode,
      blocked: stealth.blocked,
      usedStealth: true,
    };
  }
  return standard;
}
/**
 * Crawl multiple URLs with automatic mode selection
 */
export async function crawlUrls(urls: string[]): Promise<CrawlResult[]> {
  // Hard cap: anything past MAX_URLS_PER_BATCH is silently dropped.
  const list = urls.slice(0, MAX_URLS_PER_BATCH);
  if (list.length === 0) return [];
  const results: CrawlResult[] = [];
  // Split the batch: protected sites use the stealth path; the rest share one Crawl4AI call.
  const protectedUrls = list.filter(needsStealthMode);
  const normalUrls = list.filter((u) => !needsStealthMode(u));
  if (normalUrls.length > 0 && CRAWL4AI_URL) {
    try {
      const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          urls: normalUrls,
          crawler_config: {
            type: 'CrawlerRunConfig',
            params: {
              cache_mode: 'bypass',
              page_timeout: 30000,
              simulate_user: true,
              override_navigator: true,
            },
          },
        }),
        signal: AbortSignal.timeout(TIMEOUT_MS),
      });
      if (res.ok) {
        const data = (await res.json()) as {
          results?: Array<{
            url?: string;
            success?: boolean;
            markdown?: unknown;
            metadata?: { title?: string };
            status_code?: number;
          }>;
        };
        type ResultItem = {
          url?: string;
          success?: boolean;
          markdown?: unknown;
          metadata?: { title?: string };
          status_code?: number;
        };
        // Index responses by URL — Crawl4AI's result order is not relied upon.
        const byUrl = new Map<string, ResultItem>();
        for (const r of data.results ?? []) {
          if (r.url) byUrl.set(r.url, r);
        }
        for (const url of normalUrls) {
          const r = byUrl.get(url);
          const markdown = extractCrawl4aiMarkdown(r?.markdown);
          if (r?.success && markdown.length > 100) {
            results.push({
              url,
              title: r.metadata?.title ?? url,
              content: markdown,
              success: true,
              statusCode: r.status_code,
            });
          } else {
            // Per-URL fallback when this entry failed or came back too thin.
            const fallback = await fetchFallback(url);
            results.push(fallback);
          }
        }
      } else {
        const fallbacks = await Promise.all(normalUrls.map(fetchFallback));
        results.push(...fallbacks);
      }
    } catch {
      const fallbacks = await Promise.all(normalUrls.map(fetchFallback));
      results.push(...fallbacks);
    }
  } else if (normalUrls.length > 0) {
    // No Crawl4AI configured: plain fetch for every URL, in parallel.
    const fallbacks = await Promise.all(normalUrls.map(fetchFallback));
    results.push(...fallbacks);
  }
  // Stealth crawls run sequentially — NOTE(review): presumably to avoid burning
  // proxies / tripping rate limits; confirm in stealth-crawler.
  for (const url of protectedUrls) {
    const stealthResult = await smartCrawl(url);
    results.push({
      url: stealthResult.url,
      title: stealthResult.title,
      content: stealthResult.content,
      success: stealthResult.success,
      statusCode: stealthResult.statusCode,
      blocked: stealthResult.blocked,
      usedStealth: true,
    });
  }
  // Restore the input order. NOTE(review): results whose final URL differs from
  // the requested one (e.g. redirects in smartCrawl) are dropped here — verify
  // smartCrawl echoes the input URL.
  return list.map((url) => results.find((r) => r.url === url)!).filter(Boolean);
}
/**
 * Force a stealth crawl with maximum retries and a mandatory proxy.
 * Use when a standard crawl has already failed.
 */
export async function crawlUrlStealth(url: string): Promise<CrawlResult> {
  const stealth = await stealthCrawl(url, { maxRetries: 3, forceProxy: true });
  const { url: finalUrl, title, content, success, statusCode, blocked } = stealth;
  return {
    url: finalUrl,
    title,
    content,
    success,
    statusCode,
    blocked,
    usedStealth: true,
  };
}
export { needsStealthMode };

View File

@@ -0,0 +1,176 @@
/**
* Синхронный медиа-поиск (images/videos) через SearXNG
* Без LLM — поиск напрямую по запросу пользователя
*/
import { searchSearxng } from './searxng.js';
/** Normalized image hit returned by searchImages(). */
export interface ImageResult {
  url: string; // full-size image URL (img_src, falling back to thumbnail_src)
  thumbnailUrl?: string;
  title: string;
  source: string; // engine name, or the source page's domain as fallback
  sourceUrl: string; // page the image was found on
  width?: number; // not populated by searchImages() — reserved for callers
  height?: number; // not populated by searchImages() — reserved for callers
}
/** Normalized video hit returned by searchVideos(). */
export interface VideoResult {
  title: string;
  thumbnail: string; // required: results without a thumbnail are dropped
  url: string;
  embedUrl?: string; // engine-provided iframe_src, when available
  duration: number; // seconds; 0 when unknown or unparsable
  views?: number;
  author: string; // engine-reported author, or source domain as fallback
  authorUrl?: string; // not populated by searchVideos()
  platform: 'rutube' | 'vk' | 'youtube' | 'dzen' | 'other';
  publishedAt?: string;
}
/** Raw SearXNG result shape for the 'images' category (fields we consume). */
interface SearxngImageResult {
  title: string;
  url: string;
  img_src?: string;
  thumbnail_src?: string;
  content?: string;
  source?: string;
  engine?: string;
  resolution?: string;
}
/** Raw SearXNG result shape for the 'videos' category (fields we consume). */
interface SearxngVideoResult {
  title: string;
  url: string;
  thumbnail?: string;
  content?: string;
  engine?: string;
  duration?: string; // "HH:MM:SS", "MM:SS", or plain seconds — see parseDuration
  views?: number;
  author?: string;
  iframe_src?: string;
  publishedDate?: string;
}
/** Map a video URL to its hosting platform by substring match (first hit wins). */
function detectVideoPlatform(url: string): VideoResult['platform'] {
  const platformNeedles: Array<[VideoResult['platform'], string[]]> = [
    ['youtube', ['youtube.com', 'youtu.be']],
    ['rutube', ['rutube.ru']],
    ['vk', ['vk.com', 'vk.video']],
    ['dzen', ['dzen.ru', 'zen.yandex']],
  ];
  for (const [platform, needles] of platformNeedles) {
    if (needles.some((needle) => url.includes(needle))) {
      return platform;
    }
  }
  return 'other';
}
/**
 * Parse a duration string into seconds.
 *
 * Accepts "HH:MM:SS", "MM:SS", or a plain number of seconds. Returns 0 for
 * missing or unparsable input. Fix vs. previous version: a colon-delimited
 * string with a non-numeric part (e.g. "1:ab") now returns 0 instead of
 * propagating NaN into VideoResult.duration.
 */
function parseDuration(durationStr?: string): number {
  if (!durationStr) return 0;
  const parts = durationStr.split(':').map(Number);
  if (parts.length === 2 || parts.length === 3) {
    // Reject malformed colon strings instead of producing NaN.
    if (!parts.every(Number.isFinite)) return 0;
    // Fold "H:M:S" / "M:S" left-to-right: each step shifts by one base-60 place.
    return parts.reduce((total, part) => total * 60 + part, 0);
  }
  // Single token: parseInt tolerates trailing junk like "90s"; NaN maps to 0.
  return parseInt(durationStr, 10) || 0;
}
/**
 * Extract a display domain from a URL, stripping a leading "www." prefix.
 * Returns the input unchanged when it is not a valid URL.
 *
 * Fix vs. previous version: replace('www.', '') removed the first occurrence
 * anywhere in the hostname (e.g. "example.www.com" → "example.com"); the
 * anchored regex only strips a true prefix.
 */
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    return url;
  }
}
/**
 * Search images via SearXNG and normalize the hits into ImageResult entries.
 * Deduplicates by image URL and stops once `limit` items are collected.
 * Returns an empty array on any failure (logged, never thrown).
 */
export async function searchImages(query: string, limit = 8): Promise<ImageResult[]> {
  try {
    const { results } = await searchSearxng(query, {
      categories: ['images'],
    });
    const seen = new Set<string>();
    const images: ImageResult[] = [];
    for (const raw of results as unknown as SearxngImageResult[]) {
      if (images.length >= limit) break;
      const imgUrl = raw.img_src || raw.thumbnail_src;
      if (!imgUrl || seen.has(imgUrl)) continue;
      seen.add(imgUrl);
      images.push({
        url: imgUrl,
        thumbnailUrl: raw.thumbnail_src || imgUrl,
        title: raw.title || '',
        source: raw.engine || extractDomain(raw.url),
        sourceUrl: raw.url,
      });
    }
    return images;
  } catch (err) {
    console.error('[media-search] Image search failed:', err);
    return [];
  }
}
/**
 * Search videos via SearXNG and normalize the hits into VideoResult entries.
 * Requires url + title + thumbnail; deduplicates by URL; stops at `limit`.
 * Returns an empty array on any failure (logged, never thrown).
 */
export async function searchVideos(query: string, limit = 6): Promise<VideoResult[]> {
  try {
    const { results } = await searchSearxng(query, {
      categories: ['videos'],
    });
    const seen = new Set<string>();
    const videos: VideoResult[] = [];
    for (const raw of results as unknown as SearxngVideoResult[]) {
      if (videos.length >= limit) break;
      if (!raw.url || !raw.title || seen.has(raw.url)) continue;
      // Mark as seen before the thumbnail check so duplicates never re-enter.
      seen.add(raw.url);
      const thumbnail = raw.thumbnail || '';
      if (!thumbnail) continue;
      videos.push({
        title: raw.title,
        thumbnail,
        url: raw.url,
        embedUrl: raw.iframe_src,
        duration: parseDuration(raw.duration),
        views: raw.views,
        author: raw.author || extractDomain(raw.url),
        platform: detectVideoPlatform(raw.url),
        publishedAt: raw.publishedDate,
      });
    }
    return videos;
  } catch (err) {
    console.error('[media-search] Video search failed:', err);
    return [];
  }
}
export interface MediaSearchResult {
images: ImageResult[];
videos: VideoResult[];
}
/**
 * Run image and video search concurrently and return both result sets.
 * Defaults: 8 images, 6 videos.
 */
export async function searchMedia(
  query: string,
  options?: { maxImages?: number; maxVideos?: number }
): Promise<MediaSearchResult> {
  // Kick off both searches before awaiting either, so they run in parallel.
  const imagesPromise = searchImages(query, options?.maxImages ?? 8);
  const videosPromise = searchVideos(query, options?.maxVideos ?? 6);
  return {
    images: await imagesPromise,
    videos: await videosPromise,
  };
}

View File

@@ -59,6 +59,6 @@ You must respond in the following JSON format without any extra text, explanatio
</output_format>
`;
export function getClassifierPrompt(locale?: string): string {
return baseClassifierPrompt + getLocaleInstruction(locale);
export function getClassifierPrompt(locale?: string, detectedLanguage?: string): string {
return baseClassifierPrompt + getLocaleInstruction(locale, detectedLanguage);
}

View File

@@ -0,0 +1,29 @@
import { franc } from 'franc';
/** ISO 639-3 (franc) → human-readable language name */
// Covers the languages this product localizes for; codes missing from the map
// fall back to the raw ISO 639-3 code in detectLanguage().
const ISO6393_TO_LANGUAGE: Record<string, string> = {
  rus: 'Russian', eng: 'English', deu: 'German', fra: 'French', spa: 'Spanish',
  ita: 'Italian', por: 'Portuguese', ukr: 'Ukrainian', pol: 'Polish', cmn: 'Chinese',
  jpn: 'Japanese', kor: 'Korean', arb: 'Arabic', tur: 'Turkish', bel: 'Belarusian',
  kaz: 'Kazakh', swe: 'Swedish', nob: 'Norwegian', dan: 'Danish', fin: 'Finnish',
  ces: 'Czech', slk: 'Slovak', hun: 'Hungarian', ron: 'Romanian', bul: 'Bulgarian',
  hrv: 'Croatian', srp: 'Serbian', ell: 'Greek', hin: 'Hindi', tha: 'Thai',
  vie: 'Vietnamese', ind: 'Indonesian', zlm: 'Malay', heb: 'Hebrew', pes: 'Persian',
  nld: 'Dutch', lit: 'Lithuanian', lav: 'Latvian', est: 'Estonian', slv: 'Slovenian',
};
/** Fallback instruction when language cannot be determined */
// Presumably appended to LLM prompts when detection yields no usable language —
// confirm at call sites.
export const FALLBACK_RESPONSE_LANGUAGE = `
<response_language>
Always respond in the same language as the user's query. Detect the language from the user's message and reply accordingly. If the user wrote in Russian, respond in Russian; if in English, respond in English; and so on.
</response_language>`;
/**
 * Detect the language of `text` with franc.
 * Returns a human-readable name (or the raw ISO 639-3 code when unmapped),
 * or undefined when the text is too short or franc cannot decide.
 */
export function detectLanguage(text: string): string | undefined {
  const sample = text ? text.trim() : '';
  if (sample.length < 2) return undefined;
  const iso3 = franc(sample, { minLength: 2 });
  if (iso3 === 'und') return undefined;
  return ISO6393_TO_LANGUAGE[iso3] ?? iso3;
}

View File

@@ -1,19 +1,17 @@
const LOCALE_TO_LANGUAGE: Record<string, string> = {
ru: 'Russian', en: 'English', de: 'German', fr: 'French', es: 'Spanish',
it: 'Italian', pt: 'Portuguese', uk: 'Ukrainian', pl: 'Polish', zh: 'Chinese',
ja: 'Japanese', ko: 'Korean', ar: 'Arabic', tr: 'Turkish', be: 'Belarusian',
kk: 'Kazakh', sv: 'Swedish', nb: 'Norwegian', da: 'Danish', fi: 'Finnish',
cs: 'Czech', sk: 'Slovak', hu: 'Hungarian', ro: 'Romanian', bg: 'Bulgarian',
hr: 'Croatian', sr: 'Serbian', el: 'Greek', hi: 'Hindi', th: 'Thai',
vi: 'Vietnamese', id: 'Indonesian', ms: 'Malay', he: 'Hebrew', fa: 'Persian',
};
export function getLocaleInstruction(locale?: string): string {
if (!locale) return '';
const lang = locale.split('-')[0];
const languageName = LOCALE_TO_LANGUAGE[lang] ?? lang;
return `
/** Принудительно русский язык везде */
const RUSSIAN_RESPONSE_LANGUAGE = `
<response_language>
User's locale is ${locale}. Always format your response in ${languageName}, regardless of the language of the query or search results. Even when the discussed content is in another language, respond in ${languageName}.
Always respond ONLY in Russian. Regardless of the user's query language or search results, format your entire response in Russian.
</response_language>`;
export const FALLBACK_RESPONSE_LANGUAGE = RUSSIAN_RESPONSE_LANGUAGE;
/**
* Build response language instruction. Always Russian.
*/
export function getLocaleInstruction(
_locale?: string,
_detectedLanguage?: string,
): string {
return RUSSIAN_RESPONSE_LANGUAGE;
}

View File

@@ -7,7 +7,7 @@ export const MASTER_SYSTEM_PROMPT = `You are a Master Agent that adapts to any t
- If a task requires multiple steps, use tools iteratively until you have enough information
## Available tools
- web_search: Search the web. Use SEO-friendly keywords. Up to 3 queries per call.
- web_search: Search the web. Use SEO-friendly keywords. Up to 6 queries per call. Use queries in the same language as the user's request.
- scrape_url: Fetch and extract content from a specific URL. Use when user asks about a page.
- calculator: Evaluate math expressions (arithmetic, percentages, sqrt, etc).
- get_stock_quote: Get current stock price for a ticker (AAPL, TSLA, etc).
@@ -19,4 +19,4 @@ export const MASTER_SYSTEM_PROMPT = `You are a Master Agent that adapts to any t
- Use get_stock_quote for stock prices; calculator for math.
- After gathering info, provide a concise, accurate answer. Cite sources when relevant.
- If a tool fails, try alternatives or inform the user.
- Respond in the same language as the user's query unless asked otherwise.`;
- Always respond in Russian.`;

View File

@@ -29,8 +29,9 @@ ${actionDesc}
<response_protocol>
- NEVER output normal text to the user. ONLY call tools.
- Choose the appropriate tools based on the action descriptions provided above.
- Default to web_search when information is missing or stale; keep queries targeted (max 3 per call).
- Default to web_search when information is missing or stale. Use 34 diverse queries covering different aspects of the topic; write queries in the user's language as clear search phrases; avoid near-duplicate queries (same intent in other words).
- Call done when you have gathered enough to answer or performed the required actions.
- CRITICAL: Use ONLY the exact tool names listed above. The tool is called "web_search", NOT "google_search", "search", or any other name. Using wrong names will cause errors.
- Do not invent tools. Do not return JSON.
</response_protocol>
@@ -69,7 +70,8 @@ ${actionDesc}
- NEVER output normal text to the user. ONLY call tools.
- Start with __reasoning_preamble and call __reasoning_preamble before every tool call (including done).
- Choose tools based on the action descriptions provided above.
- Default to web_search when information is missing or stale; keep queries targeted (max 3 per call).
- Default to web_search when information is missing or stale. Use 34 diverse queries covering different aspects; user's language, clear search phrases; avoid near-duplicate queries.
- CRITICAL: Use ONLY the exact tool names listed above. The tool is called "web_search", NOT "google_search", "search", or any other name. Using wrong names will cause errors.
- Use at most 6 tool calls total. Do not invent tools. Do not return JSON.
</response_protocol>
@@ -114,6 +116,8 @@ For any topic, consider searching: Core definition/overview, Features/capabiliti
- NEVER output normal text to the user. ONLY call tools.
- Follow an iterative loop: __reasoning_preamble → tool call → __reasoning_preamble → tool call → ... → __reasoning_preamble → done.
- Each __reasoning_preamble should reflect on previous results (if any) and state the next research step.
- For web_search: use diverse queries (different angles, user's language, clear search phrases); avoid near-duplicate queries.
- CRITICAL: Use ONLY the exact tool names listed above. The tool is called "web_search", NOT "google_search", "search", or any other name. Using wrong names will cause errors.
- Aim for 4-7 information-gathering calls covering different angles.
- Call done only after comprehensive, multi-angle research is complete.
- Do not invent tools. Do not return JSON.
@@ -123,6 +127,23 @@ ${fileDesc ? `<user_uploaded_files>\n${fileDesc}\n</user_uploaded_files>` : ''}
`;
}
const ARTICLE_SUMMARY_INSTRUCTION = `
<article_summary_mode>
The user requested an article summary (Summary: <url>). This is a Perplexity-style multi-source digest request.
CRITICAL STEPS (follow in order):
1. FIRST call scrape_url with the EXACT URL from the user message. This gets the primary article content.
2. THEN call web_search with 2-3 queries about the article's TOPIC (not the URL!). Extract the topic from the article title provided in the message (after "Article title:"). Use the same language as the article. Examples for Russian: "реформа образования наука 2026", "научные исследования финансирование". Examples for English: "AI regulation EU 2026", "tech layoffs impact".
3. If the article title mentions specific people, organizations, or events — search for those specifically.
IMPORTANT:
- You MUST call scrape_url with the original URL — this is the PRIMARY source.
- Search queries should be about the TOPIC, not the URL or article ID.
- Never search for URL fragments like "kommersant 8462696" — search for the actual topic.
- Gather as many relevant sources as possible for a comprehensive multi-source summary.
</article_summary_mode>
`;
export function getResearcherPrompt(
actionDesc: string,
mode: Mode,
@@ -130,6 +151,8 @@ export function getResearcherPrompt(
maxIteration: number,
fileDesc: string,
locale?: string,
detectedLanguage?: string,
isArticleSummary?: boolean,
): string {
let prompt: string;
switch (mode) {
@@ -145,5 +168,6 @@ export function getResearcherPrompt(
default:
prompt = getSpeedPrompt(actionDesc, i, maxIteration, fileDesc);
}
return prompt + getLocaleInstruction(locale);
if (isArticleSummary) prompt += ARTICLE_SUMMARY_INSTRUCTION;
return prompt + getLocaleInstruction(locale, detectedLanguage);
}

View File

@@ -26,6 +26,8 @@ export function getWriterPrompt(
answerMode?: AnswerMode,
responsePrefs?: ResponsePrefs,
learningMode?: boolean,
detectedLanguage?: string,
isArticleSummary?: boolean,
): string {
const memoryBlock = memoryContext?.trim()
? `\n### User memory (personalization)\nUse these stored facts/preferences to personalize when relevant. Do NOT cite as source.\n${memoryContext}\n`
@@ -58,6 +60,26 @@ export function getWriterPrompt(
? `\n### Step-by-step Learning mode\nExplain your reasoning step-by-step. Break down complex concepts. Show the logical flow. Use numbered steps or "First... Then... Finally" structure.\n`
: '';
const articleSummaryBlock = isArticleSummary
? `\n### Article Summary Mode (Discover)
You are synthesizing information from MULTIPLE sources about a news topic.
This is a Perplexity-style multi-source digest. Follow these rules strictly:
- **EVERY sentence** MUST have at least one citation [N] referring to a search_results source.
- Use **ALL** available sources, not just the first one. Distribute citations across all sources.
- Structure the summary as:
1. **Introduction** (2-3 sentences, overview of the topic)
2. **Key Details** (main facts, events, numbers from multiple sources)
3. **Analysis** (expert opinions, implications, different viewpoints)
4. **Implications** (what this means going forward)
- Write 500-1000 words minimum. Be comprehensive.
- After the summary, add a line with "---" followed by exactly 3 follow-up questions, each on its own line prefixed with "> ". These questions should help the user explore the topic deeper.
- Write in Russian (unless the user's locale indicates otherwise).
- Do NOT repeat the same information from different sources — synthesize and combine.
- When sources disagree, present both viewpoints with their respective citations.
`
: '';
return `
You are GooSeek, an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
@@ -74,10 +96,11 @@ Your task is to provide answers that are:
### Special Instructions
- The context contains two sections: \`search_results\` (web search) and \`widgets_result\` (calculations, weather, stocks). If widgets_result has the answer, USE IT.
- Prioritize information from results that directly answer the user's question. Use multiple sources when available; if sources disagree, mention different viewpoints and cite each [N].
- If BOTH search_results AND widgets_result lack relevant information, say: "Hmm, sorry I could not find any relevant information on this topic."
${mode === 'quality' ? "- QUALITY MODE: Generate very deep, detailed responses. At least 2000 words, cover everything like a research report." : ''}
${mode === 'quality' ? "- QUALITY MODE: Generate very deep, detailed responses. At least 2000 words, cover everything like a research report. Use as many of the provided results as relevant." : ''}
${verticalBlock}${prefsBlock}${learningBlock}
${articleSummaryBlock}${verticalBlock}${prefsBlock}${learningBlock}
### User instructions
${systemInstructions}
${memoryBlock}
@@ -87,6 +110,6 @@ ${context}
</context>
Current date & time (UTC): ${new Date().toISOString()}.
${getLocaleInstruction(locale)}
${getLocaleInstruction(locale, detectedLanguage)}
`;
}

View File

@@ -0,0 +1,324 @@
/**
* Proxy Manager — ротация Tor circuits + free proxy lists
* Опенсорс, без платных API
*/
/** A single upstream proxy endpoint. */
interface ProxyConfig {
  type: 'tor' | 'http' | 'socks5';
  host: string;
  port: number;
  username?: string; // optional credentials for authenticated proxies
  password?: string;
}
/** Cached health-probe state for one proxy, keyed by ProxyManager.proxyKey(). */
interface ProxyHealth {
  proxy: ProxyConfig;
  lastCheck: number; // epoch ms of the most recent probe
  healthy: boolean;
  latencyMs: number; // probe round-trip; -1 when the last probe failed
  failCount: number; // consecutive failures; reset to 0 on a successful probe
}
// Tor SOCKS5 endpoint. Default host 'tor-proxy' looks like a compose/K8s
// service name — confirm against the deployment.
const TOR_PROXY: ProxyConfig = {
  type: 'socks5',
  host: process.env.TOR_PROXY_HOST ?? 'tor-proxy',
  port: parseInt(process.env.TOR_PROXY_PORT ?? '9050', 10),
};
// Tor control port used to request fresh circuits (SIGNAL NEWNYM).
const TOR_CONTROL_PORT = parseInt(process.env.TOR_CONTROL_PORT ?? '9051', 10);
const TOR_CONTROL_PASSWORD = process.env.TOR_CONTROL_PASSWORD ?? 'gooseek_tor_control';
// Public GitHub-hosted plaintext proxy lists, one "ip:port" per line.
const FREE_PROXY_SOURCES = [
  'https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt',
  'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt',
  'https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt',
];
/**
 * Rotates outbound traffic across Tor circuits and free public HTTP proxies.
 * Tor is preferred whenever reachable; the free lists are a best-effort
 * fallback probed for health before use.
 */
class ProxyManager {
  private freeProxies: ProxyConfig[] = [];
  private proxyHealth: Map<string, ProxyHealth> = new Map();
  private currentProxyIndex = 0; // round-robin cursor into freeProxies
  private torCircuitId = 0; // incremented on every successful NEWNYM
  private lastTorRotation = 0; // epoch ms; throttles NEWNYM to once per 10s
  private lastProxyFetch = 0; // epoch ms; free lists refreshed at most hourly
  // Stable cache key for a proxy endpoint (credentials excluded).
  private proxyKey(p: ProxyConfig): string {
    return `${p.type}://${p.host}:${p.port}`;
  }
  /**
   * Rotate Tor circuit via control port (NEWNYM signal).
   * A new circuit means a new exit node, i.e. a new public IP.
   * Throttled: within 10s of the last rotation it reports success without
   * contacting the control port. Resolves false on auth/protocol failure,
   * socket error, or 5s timeout — never rejects.
   */
  async rotateTorCircuit(): Promise<boolean> {
    const now = Date.now();
    if (now - this.lastTorRotation < 10000) {
      return true;
    }
    try {
      const net = await import('net');
      return new Promise((resolve) => {
        const socket = net.createConnection(TOR_CONTROL_PORT, TOR_PROXY.host);
        let authenticated = false;
        socket.setTimeout(5000);
        socket.on('connect', () => {
          // Tor control protocol: AUTHENTICATE first, then send signals.
          socket.write(`AUTHENTICATE "${TOR_CONTROL_PASSWORD}"\r\n`);
        });
        socket.on('data', (data) => {
          const response = data.toString();
          if (response.includes('250 OK') && !authenticated) {
            // First 250 OK acknowledges authentication.
            authenticated = true;
            socket.write('SIGNAL NEWNYM\r\n');
          } else if (response.includes('250 OK') && authenticated) {
            // Second 250 OK acknowledges NEWNYM — rotation succeeded.
            this.torCircuitId++;
            this.lastTorRotation = now;
            socket.end();
            resolve(true);
          } else if (response.includes('515') || response.includes('551')) {
            // Tor control error replies (515 bad auth / 551 internal error —
            // per the control-spec; confirm against the deployed Tor version).
            socket.end();
            resolve(false);
          }
        });
        socket.on('error', () => {
          resolve(false);
        });
        socket.on('timeout', () => {
          socket.destroy();
          resolve(false);
        });
      });
    } catch {
      return false;
    }
  }
  /**
   * Fetch free proxies from public GitHub lists.
   * No-op if a non-empty list was fetched within the last hour. Keeps at most
   * 100 shuffled entries; only the first 50 lines of each source are parsed.
   * Failing sources are skipped silently.
   */
  async fetchFreeProxies(): Promise<void> {
    const now = Date.now();
    if (now - this.lastProxyFetch < 3600000 && this.freeProxies.length > 0) {
      return;
    }
    const newProxies: ProxyConfig[] = [];
    for (const source of FREE_PROXY_SOURCES) {
      try {
        const res = await fetch(source, {
          signal: AbortSignal.timeout(10000),
          headers: { 'User-Agent': 'GooSeek-ProxyFetcher/1.0' },
        });
        if (!res.ok) continue;
        const text = await res.text();
        const lines = text.split('\n').filter((l) => l.trim());
        for (const line of lines.slice(0, 50)) {
          // Only plain "IPv4:port" lines are accepted.
          const match = line.trim().match(/^(\d+\.\d+\.\d+\.\d+):(\d+)$/);
          if (match) {
            newProxies.push({
              type: 'http',
              host: match[1],
              port: parseInt(match[2], 10),
            });
          }
        }
      } catch {
        continue;
      }
    }
    if (newProxies.length > 0) {
      this.freeProxies = this.shuffleArray(newProxies).slice(0, 100);
      this.lastProxyFetch = now;
    }
  }
  /**
   * Check if the Tor proxy is reachable by hitting the Tor Project's
   * check endpoint through it.
   */
  async isTorAvailable(): Promise<boolean> {
    try {
      const testUrl = 'https://check.torproject.org/api/ip';
      const res = await this.fetchWithProxy(testUrl, TOR_PROXY, 10000);
      return res.ok;
    } catch {
      return false;
    }
  }
  /**
   * Fetch a URL through a proxy (SOCKS5 or HTTP) with a hard timeout.
   * Rethrows fetch errors after clearing the timeout.
   *
   * NOTE(review): Node's built-in fetch (undici) ignores an `agent` option —
   * the @ts-expect-error below suggests these requests may silently bypass
   * the proxy. Confirm the runtime's fetch honors `agent` (e.g. node-fetch)
   * or switch to an undici ProxyAgent `dispatcher`.
   */
  private async fetchWithProxy(
    url: string,
    proxy: ProxyConfig,
    timeoutMs: number
  ): Promise<Response> {
    const { SocksProxyAgent } = await import('socks-proxy-agent');
    const { HttpsProxyAgent } = await import('https-proxy-agent');
    let agent;
    if (proxy.type === 'socks5') {
      const proxyUrl = proxy.username
        ? `socks5://${proxy.username}:${proxy.password}@${proxy.host}:${proxy.port}`
        : `socks5://${proxy.host}:${proxy.port}`;
      agent = new SocksProxyAgent(proxyUrl);
    } else {
      // 'http' and 'tor' types both go through the HTTPS proxy agent here.
      const proxyUrl = proxy.username
        ? `http://${proxy.username}:${proxy.password}@${proxy.host}:${proxy.port}`
        : `http://${proxy.host}:${proxy.port}`;
      agent = new HttpsProxyAgent(proxyUrl);
    }
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const res = await fetch(url, {
        // @ts-expect-error Node.js fetch supports agent
        agent,
        signal: controller.signal,
        headers: {
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        },
      });
      clearTimeout(timeoutId);
      return res;
    } catch (err) {
      clearTimeout(timeoutId);
      throw err;
    }
  }
  /**
   * Get the next working proxy.
   * Prefers Tor (rotating its circuit first); otherwise probes up to 10
   * free-list candidates round-robin, skipping ones marked unhealthy within
   * the last 5 minutes. Returns null when nothing usable is found.
   */
  async getNextProxy(): Promise<ProxyConfig | null> {
    const torAvailable = await this.isTorAvailable();
    if (torAvailable) {
      await this.rotateTorCircuit();
      return TOR_PROXY;
    }
    await this.fetchFreeProxies();
    if (this.freeProxies.length === 0) {
      return null;
    }
    for (let i = 0; i < Math.min(10, this.freeProxies.length); i++) {
      const proxy = this.freeProxies[this.currentProxyIndex % this.freeProxies.length];
      this.currentProxyIndex++;
      const health = this.proxyHealth.get(this.proxyKey(proxy));
      // Skip proxies that failed a probe less than 5 minutes ago.
      if (health && !health.healthy && Date.now() - health.lastCheck < 300000) {
        continue;
      }
      const isHealthy = await this.checkProxyHealth(proxy);
      if (isHealthy) {
        return proxy;
      }
    }
    return null;
  }
  /**
   * Probe a proxy by fetching httpbin.org/ip through it and record the
   * result (latency on success, incremented failCount on failure).
   */
  private async checkProxyHealth(proxy: ProxyConfig): Promise<boolean> {
    const key = this.proxyKey(proxy);
    const start = Date.now();
    try {
      const res = await this.fetchWithProxy('https://httpbin.org/ip', proxy, 10000);
      const latencyMs = Date.now() - start;
      this.proxyHealth.set(key, {
        proxy,
        lastCheck: Date.now(),
        healthy: res.ok,
        latencyMs,
        failCount: 0,
      });
      return res.ok;
    } catch {
      const existing = this.proxyHealth.get(key);
      this.proxyHealth.set(key, {
        proxy,
        lastCheck: Date.now(),
        healthy: false,
        latencyMs: -1,
        failCount: (existing?.failCount ?? 0) + 1,
      });
      return false;
    }
  }
  /**
   * Get the next proxy formatted as a URL string (for Crawl4AI config).
   * Returns null when no proxy is available.
   */
  async getProxyUrl(): Promise<string | null> {
    const proxy = await this.getNextProxy();
    if (!proxy) return null;
    if (proxy.type === 'socks5') {
      return `socks5://${proxy.host}:${proxy.port}`;
    }
    return `http://${proxy.host}:${proxy.port}`;
  }
  /**
   * Get the Tor SOCKS5 proxy URL unconditionally (no reachability check).
   * Use when Tor is required.
   */
  getTorProxyUrl(): string {
    return `socks5://${TOR_PROXY.host}:${TOR_PROXY.port}`;
  }
  // Fisher–Yates shuffle over a copy; the input array is not mutated.
  private shuffleArray<T>(array: T[]): T[] {
    const result = [...array];
    for (let i = result.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      [result[i], result[j]] = [result[j], result[i]];
    }
    return result;
  }
  /**
   * Get current Tor circuit ID (for logging).
   */
  getTorCircuitId(): number {
    return this.torCircuitId;
  }
  /**
   * Get stats: rotation counter, size of the free list, and how many
   * probed proxies are currently marked healthy.
   */
  getStats(): {
    torCircuitId: number;
    freeProxiesCount: number;
    healthyProxies: number;
  } {
    const healthyCount = Array.from(this.proxyHealth.values()).filter((h) => h.healthy).length;
    return {
      torCircuitId: this.torCircuitId,
      freeProxiesCount: this.freeProxies.length,
      healthyProxies: healthyCount,
    };
  }
}
// Module-level singleton — all callers share one rotation/health state.
export const proxyManager = new ProxyManager();
export type { ProxyConfig };

View File

@@ -0,0 +1,314 @@
/**
* Adaptive Reranker Module
*
* Provides multiple reranking strategies for search results:
* - BM25: Fast, local, no API calls - standard IR algorithm used by Elasticsearch/Solr
* - LLM: Uses the configured LLM to score relevance (slower but smarter)
* - Jina: Uses Jina Reranker API (requires JINA_API_KEY env var)
*
* BM25 is NOT hardcoded - it's a mathematical formula that adapts to:
* - Document length normalization
* - Term frequency saturation
* - Inverse document frequency
*/
/** Minimal shape a search hit must expose to be rerankable. */
export interface RerankableItem {
  content: string;
  title?: string;
  url?: string;
  metadata?: Record<string, unknown>;
}
/** A rerankable item plus its relevance score.
 * NOTE: the score scale depends on the strategy — raw BM25 for 'bm25',
 * descending rank position for 'llm', Jina's relevance_score for 'jina'. */
export interface RankedItem extends RerankableItem {
  score: number;
}
export type RerankStrategy = 'bm25' | 'llm' | 'jina';
interface BM25Config {
  k1: number; // term frequency saturation, typically 1.2-2.0
  b: number; // length normalization, typically 0.75
}
// Conventional Okapi BM25 defaults.
const DEFAULT_BM25_CONFIG: BM25Config = {
  k1: 1.5,
  b: 0.75,
};
/** Lowercase, strip everything but letters/digits/whitespace (Unicode-aware),
 *  and keep tokens of at least two characters. */
function tokenize(text: string): string[] {
  const normalized = text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ');
  const tokens: string[] = [];
  for (const candidate of normalized.split(/\s+/)) {
    if (candidate.length >= 2) {
      tokens.push(candidate);
    }
  }
  return tokens;
}
/** Inverse document frequency for `term`, with +1 inside the log so the
 *  result is never negative; 0 when the term appears in no document. */
function computeIdf(docs: string[][], term: string): number {
  let df = 0;
  for (const doc of docs) {
    if (doc.includes(term)) df += 1;
  }
  if (df === 0) return 0;
  return Math.log(1 + (docs.length - df + 0.5) / (df + 0.5));
}
/** Okapi BM25 score of `doc` against `query`, using precomputed IDFs.
 *  Terms absent from the document contribute nothing. */
function bm25Score(
  query: string[],
  doc: string[],
  avgDocLen: number,
  idfCache: Map<string, number>,
  config: BM25Config,
): number {
  // Term frequencies within this document.
  const termFreq = new Map<string, number>();
  for (const token of doc) {
    termFreq.set(token, (termFreq.get(token) ?? 0) + 1);
  }
  // Length normalization factor is constant per document — hoist it.
  const lengthNorm = 1 - config.b + config.b * (doc.length / avgDocLen);
  let total = 0;
  for (const term of query) {
    const tf = termFreq.get(term) ?? 0;
    if (tf === 0) continue;
    const idf = idfCache.get(term) ?? 0;
    total += idf * ((tf * (config.k1 + 1)) / (tf + config.k1 * lengthNorm));
  }
  return total;
}
export function rerankBM25(
items: RerankableItem[],
query: string,
topK?: number,
config: BM25Config = DEFAULT_BM25_CONFIG,
): RankedItem[] {
if (items.length === 0) return [];
const queryTokens = tokenize(query);
if (queryTokens.length === 0) {
return items.slice(0, topK).map((item) => ({ ...item, score: 0 }));
}
const docs = items.map((item) => {
const text = `${item.title ?? ''} ${item.content}`;
return tokenize(text);
});
const avgDocLen = docs.reduce((sum, d) => sum + d.length, 0) / docs.length || 1;
const allTerms = new Set([...queryTokens]);
const idfCache = new Map<string, number>();
for (const term of allTerms) {
idfCache.set(term, computeIdf(docs, term));
}
const scored: RankedItem[] = items.map((item, i) => ({
...item,
score: bm25Score(queryTokens, docs[i], avgDocLen, idfCache, config),
}));
scored.sort((a, b) => b.score - a.score);
return topK ? scored.slice(0, topK) : scored;
}
/**
 * Rerank with the LLM: BM25 pre-ranks candidates, the LLM picks the topK
 * most relevant from the top 30. Falls back to the BM25 order when the LLM
 * output is unparsable or selects fewer than half the requested items.
 */
export async function rerankWithLLM(
  items: RerankableItem[],
  query: string,
  topK: number,
  llmGenerateFn: (prompt: string) => Promise<string>,
): Promise<RankedItem[]> {
  if (items.length === 0) return [];
  // Nothing to cut — score by original position.
  if (items.length <= topK) {
    return items.map((item, i) => ({ ...item, score: items.length - i }));
  }
  const preRanked = rerankBM25(items, query, Math.min(items.length, 50));
  const listing = preRanked
    .slice(0, 30)
    .map((item, i) => `[${i}] ${item.title ?? 'No title'}\n${item.content.slice(0, 150).replace(/\n/g, ' ')}`)
    .join('\n\n');
  const prompt = `You are a search result relevance evaluator.
Query: "${query}"
Search results:
${listing}
Task: Select the ${topK} most relevant results for the query. Consider:
- Direct relevance to the query topic
- Information quality and credibility
- Diversity of perspectives
Return ONLY a JSON array of indices, e.g.: [0, 2, 5, 7, 1]
No explanation, just the array.`;
  try {
    const raw = await llmGenerateFn(prompt);
    // Strip markdown code fences before parsing.
    const cleaned = raw.replace(/```json?\s*/gi, '').replace(/```/g, '').trim();
    const parsed = JSON.parse(cleaned) as number[];
    if (Array.isArray(parsed)) {
      const validIndices = parsed.filter(
        (i) => typeof i === 'number' && i >= 0 && i < preRanked.length,
      );
      const selected: RankedItem[] = validIndices
        .slice(0, topK)
        .map((idx, rank) => ({ ...preRanked[idx], score: topK - rank }));
      // Accept only when the LLM returned at least half the requested items.
      if (selected.length >= topK * 0.5) {
        return selected;
      }
    }
  } catch {
    // LLM parsing failed, fall back to BM25
  }
  return preRanked.slice(0, topK);
}
const JINA_API_URL = 'https://api.jina.ai/v1/rerank';
/**
 * Rerank via the Jina Reranker API. BM25 pre-ranks, the top 50 candidates
 * are sent to Jina, and Jina's relevance scores are returned. Falls back to
 * the BM25 order when no API key is set or the API call fails.
 */
export async function rerankWithJina(
  items: RerankableItem[],
  query: string,
  topK: number,
  apiKey?: string,
): Promise<RankedItem[]> {
  const key = apiKey ?? process.env.JINA_API_KEY;
  if (!key) {
    // No key configured — local BM25 only.
    return rerankBM25(items, query, topK);
  }
  if (items.length === 0) return [];
  if (items.length <= topK) {
    // Nothing to cut — keep order with slightly decreasing scores.
    return items.map((item, i) => ({ ...item, score: 1 - i * 0.01 }));
  }
  const preRanked = rerankBM25(items, query, Math.min(items.length, 100));
  const documents = preRanked.slice(0, 50).map((item) => ({
    text: `${item.title ?? ''}\n${item.content.slice(0, 500)}`,
  }));
  try {
    const response = await fetch(JINA_API_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${key}`,
      },
      body: JSON.stringify({
        model: 'jina-reranker-v2-base-multilingual',
        query,
        documents,
        top_n: topK,
      }),
      signal: AbortSignal.timeout(10000),
    });
    if (!response.ok) {
      throw new Error(`Jina API error: ${response.status}`);
    }
    const payload = (await response.json()) as {
      results?: { index: number; relevance_score: number }[];
    };
    if (Array.isArray(payload.results)) {
      // Jina indices refer back into the candidate slice of preRanked.
      return payload.results.map((r): RankedItem => ({
        ...preRanked[r.index],
        score: r.relevance_score,
      }));
    }
  } catch {
    // Jina API failed, fall back to BM25
  }
  return preRanked.slice(0, topK);
}
export interface RerankOptions {
strategy?: RerankStrategy;
topK?: number;
llmGenerateFn?: (prompt: string) => Promise<string>;
jinaApiKey?: string;
}
/**
 * Strategy dispatcher for reranking. 'llm' and 'jina' each require topK
 * (and 'llm' a generate function); when prerequisites are missing, or for
 * 'bm25'/unknown strategies, local BM25 is used.
 */
export async function rerank(
  items: RerankableItem[],
  query: string,
  options: RerankOptions = {},
): Promise<RankedItem[]> {
  const { strategy = 'bm25', topK, llmGenerateFn, jinaApiKey } = options;
  if (strategy === 'llm' && llmGenerateFn && topK) {
    return rerankWithLLM(items, query, topK, llmGenerateFn);
  }
  if (strategy === 'jina' && topK) {
    return rerankWithJina(items, query, topK, jinaApiKey);
  }
  return rerankBM25(items, query, topK);
}
/**
 * Choose how many results to keep based on the result pool size, the query
 * complexity, and the speed/quality mode. The pool share scales with mode,
 * is adjusted by complexity, and is clamped into per-mode [min, max] bounds.
 */
export function computeAdaptiveTopK(
  totalResults: number,
  queryComplexity: 'simple' | 'medium' | 'complex',
  mode: 'speed' | 'balanced' | 'quality',
): number {
  const modeShare: Record<string, number> = {
    speed: 0.05,
    balanced: 0.1,
    quality: 0.15,
  };
  const complexityFactor: Record<string, number> = {
    simple: 0.8,
    medium: 1.0,
    complex: 1.3,
  };
  const bounds: Record<string, [number, number]> = {
    speed: [5, 10],
    balanced: [8, 20],
    quality: [12, 30],
  };
  const share = modeShare[mode] ?? 0.1;
  const factor = complexityFactor[queryComplexity] ?? 1.0;
  // Unknown modes clamp like 'quality', matching the previous ternary chain.
  const [minK, maxK] = bounds[mode] ?? [12, 30];
  const proposed = Math.round(totalResults * share * factor);
  return Math.min(maxK, Math.max(minK, proposed));
}
/**
 * Heuristically rates a query as 'simple', 'medium' or 'complex'.
 *
 * Comparison keywords (RU/EN), or a query of more than five tokens combined
 * with an explicit request keyword ("top", "list", …), mark it complex;
 * more than three tokens or a request keyword alone is medium; everything
 * else is simple.
 */
export function estimateQueryComplexity(query: string): 'simple' | 'medium' | 'complex' {
  const comparisonRe = /сравн|vs\.?|или|лучш|vs|compare|better|best/i;
  const requestRe = /топ|список|найди|обзор|top|list|find|review/i;
  const wordCount = tokenize(query).length;
  const wantsComparison = comparisonRe.test(query);
  const wantsSpecifics = requestRe.test(query);
  if (wantsComparison || (wordCount > 5 && wantsSpecifics)) {
    return 'complex';
  }
  if (wordCount > 3 || wantsSpecifics) {
    return 'medium';
  }
  return 'simple';
}

View File

@@ -9,11 +9,53 @@ export interface SearxngSearchResult {
title: string;
url: string;
content?: string;
thumbnail?: string;
img_src?: string;
thumbnail_src?: string;
iframe_src?: string;
author?: string;
publishedDate?: string;
engine?: string;
category?: string;
score?: number;
price?: string;
currency?: string;
duration?: number;
views?: number;
}
interface SearxngSearchOptions {
export interface SearxngSearchOptions {
engines?: string[];
categories?: string[];
pageno?: number;
}
export type ContentCategory = 'product' | 'video' | 'profile' | 'promo' | 'image' | 'article';
/**
 * Classifies a search result into a coarse content category.
 *
 * URL patterns take priority (marketplace product pages → 'product',
 * video-host pages → 'video'), then engine metadata (iframe / 'videos'
 * category also → 'video'), then profile-shaped VK / Telegram URLs,
 * then image hits; everything else falls back to 'article'.
 */
export function categorizeResult(result: SearxngSearchResult): ContentCategory {
  const href = result.url.toLowerCase();
  const marketplaceRe = /ozon\.ru\/product|wildberries\.ru\/catalog\/\d|aliexpress\.(ru|com)\/item|market\.yandex/;
  const videoHostRe = /rutube\.ru\/video|vk\.com\/video|vk\.com\/clip|youtube\.com\/watch|youtu\.be|dzen\.ru\/video/;
  const profileRe = /vk\.com\/(?!video|clip)[a-zA-Z0-9_.]+$|t\.me\/[a-zA-Z0-9_]+$/;
  if (marketplaceRe.test(href)) {
    return 'product';
  }
  if (videoHostRe.test(href) || result.iframe_src || result.category === 'videos') {
    return 'video';
  }
  if (profileRe.test(href)) {
    return 'profile';
  }
  if (result.img_src && result.category === 'images') {
    return 'image';
  }
  return 'article';
}
function buildSearchUrl(baseUrl: string, query: string, opts?: SearxngSearchOptions): string {
@@ -22,6 +64,7 @@ function buildSearchUrl(baseUrl: string, query: string, opts?: SearxngSearchOpti
params.append('q', query);
if (opts?.engines) params.append('engines', opts.engines.join(','));
if (opts?.categories) params.append('categories', Array.isArray(opts.categories) ? opts.categories.join(',') : opts.categories);
if (opts?.pageno != null) params.append('pageno', String(opts.pageno));
const base = baseUrl.trim().replace(/\/$/, '');
const prefix = /^https?:\/\//i.test(base) ? '' : 'http://';
return `${prefix}${base}/search?${params.toString()}`;
@@ -35,8 +78,10 @@ export async function searchSearxng(
const params = new URLSearchParams();
params.set('q', query);
if (opts?.engines) params.set('engines', opts.engines.join(','));
if (opts?.categories) params.set('categories', Array.isArray(opts.categories) ? opts.categories.join(',') : opts.categories);
if (opts?.pageno != null) params.set('pageno', String(opts.pageno));
const url = `${SEARCH_SVC_URL.replace(/\/$/, '')}/api/v1/search?${params.toString()}`;
const res = await fetch(url, { signal: AbortSignal.timeout(15000) });
const res = await fetch(url, { signal: AbortSignal.timeout(10000) });
if (!res.ok) throw new Error(`Search HTTP ${res.status}`);
return res.json() as Promise<{ results: SearxngSearchResult[]; suggestions?: string[] }>;
}
@@ -56,7 +101,7 @@ export async function searchSearxng(
for (const baseUrl of candidates) {
try {
const url = buildSearchUrl(baseUrl, query, opts);
const res = await fetch(url, { signal: AbortSignal.timeout(15000) });
const res = await fetch(url, { signal: AbortSignal.timeout(10000) });
const data = (await res.json()) as { results?: SearxngSearchResult[]; suggestions?: string[] };
return { results: data.results ?? [], suggestions: data.suggestions };
} catch (err) {

View File

@@ -0,0 +1,394 @@
/**
* Stealth Crawler — обход antibot защит
*
* Возможности:
* - Browser fingerprint randomization
* - Proxy rotation (Tor + free proxies)
* - User-Agent rotation
* - Human-like behavior simulation
* - Adaptive retry strategies
*/
import { proxyManager } from './proxy-manager.js';
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
/** Outcome of a single stealth-crawl call (possibly after several attempts). */
export interface StealthCrawlResult {
  url: string;
  // Page title as reported by the crawler backend ('' when unavailable).
  title: string;
  // Extracted markdown content ('' on failure).
  content: string;
  // True only when the crawl succeeded, was not detected as blocked, and
  // (unless disabled) returned a non-trivial amount of content.
  success: boolean;
  statusCode?: number;
  // Proxy URL that was in use for the final attempt, if any.
  usedProxy?: string;
  // True when the response looked like an antibot/captcha challenge.
  blocked: boolean;
  // Number of crawl attempts actually performed.
  retryCount: number;
}
// Desktop browser User-Agent strings; one is picked at random per fingerprint.
const USER_AGENTS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
];
// Common desktop screen sizes; one is picked at random per fingerprint.
const VIEWPORT_SIZES = [
  { width: 1920, height: 1080 },
  { width: 1366, height: 768 },
  { width: 1536, height: 864 },
  { width: 1440, height: 900 },
  { width: 1280, height: 720 },
];
// Substrings (RU/EN) that, when found in a page title or the first 2000
// characters of content, mark the response as an antibot/captcha block.
const ANTIBOT_INDICATORS = [
  'captcha',
  'antibot',
  'challenge',
  'почти готово',
  'проверка',
  'access denied',
  'доступ ограничен',
  'blocked',
  'forbidden',
  'robot',
  'bot detection',
  'cloudflare',
  'ddos protection',
  'rate limit',
  'too many requests',
];
/**
 * Pulls a usable markdown string out of a Crawl4AI `markdown` field, which
 * may be a plain string or an object carrying several markdown variants.
 * Preference order: raw_markdown, markdown_with_citations, then a
 * non-empty fit_markdown; otherwise ''.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') {
    return md;
  }
  if (!md || typeof md !== 'object') {
    return '';
  }
  const variants = md as Record<string, unknown>;
  if (typeof variants.raw_markdown === 'string') {
    return variants.raw_markdown;
  }
  if (typeof variants.markdown_with_citations === 'string') {
    return variants.markdown_with_citations;
  }
  const fit = variants.fit_markdown;
  if (typeof fit === 'string' && fit.length > 0) {
    return fit;
  }
  return '';
}
/** Returns a uniformly random element of arr (undefined for an empty array). */
function randomElement<T>(arr: T[]): T {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
/**
 * Detects whether a crawl response looks like an antibot block.
 * HTTP 403/429/503 count as blocked outright; otherwise the title and the
 * first 2000 characters of content are scanned for known block indicators.
 */
function isBlockedResponse(title: string, content: string, statusCode?: number): boolean {
  if (statusCode === 403 || statusCode === 429 || statusCode === 503) {
    return true;
  }
  const haystackTitle = (title || '').toLowerCase();
  const haystackBody = (content || '').toLowerCase().slice(0, 2000);
  for (const indicator of ANTIBOT_INDICATORS) {
    if (haystackTitle.includes(indicator) || haystackBody.includes(indicator)) {
      return true;
    }
  }
  return false;
}
function generateFingerprint(): {
userAgent: string;
viewport: { width: number; height: number };
locale: string;
timezone: string;
platform: string;
} {
const userAgent = randomElement(USER_AGENTS);
const viewport = randomElement(VIEWPORT_SIZES);
const locales = ['ru-RU', 'en-US', 'en-GB'];
const timezones = ['Europe/Moscow', 'Europe/London', 'America/New_York'];
const platforms = ['Win32', 'MacIntel', 'Linux x86_64'];
return {
userAgent,
viewport,
locale: randomElement(locales),
timezone: randomElement(timezones),
platform: randomElement(platforms),
};
}
/** Per-attempt crawl settings translated into Crawl4AI request parameters. */
interface CrawlConfig {
  // Route the request through a proxy obtained from proxyManager.
  useProxy: boolean;
  // Enable Crawl4AI stealth flags (simulate_user, override_navigator, magic).
  useStealth: boolean;
  // Scroll the full page with a delay (scan_full_page / scroll_delay).
  simulateUser: boolean;
  // CSS selector to wait for before capturing the page (wait_for).
  waitForSelector?: string;
  // Extra delay in ms before returning HTML; sent to the API in seconds.
  extraWaitMs?: number;
  // Arbitrary JavaScript to execute in the page (js_code).
  jsCode?: string;
}
/**
 * Performs one crawl attempt via the Crawl4AI HTTP API.
 *
 * Assembles a CrawlerRunConfig from the given config/fingerprint/proxy,
 * POSTs it to `${CRAWL4AI_URL}/crawl` with a 60 s timeout, and returns the
 * first result's title/markdown. Network errors and non-OK responses yield
 * a failed result instead of throwing.
 */
async function crawlWithConfig(
  url: string,
  config: CrawlConfig,
  fingerprint: ReturnType<typeof generateFingerprint>,
  proxyUrl?: string | null
): Promise<{
  success: boolean;
  title: string;
  content: string;
  statusCode?: number;
}> {
  // Base parameters shared by every attempt; fingerprint fields vary per call.
  const params: Record<string, unknown> = {
    cache_mode: 'bypass',
    page_timeout: 45000,
    wait_until: 'domcontentloaded',
    // API expects seconds; default to 2 s when no extra wait is requested.
    delay_before_return_html: config.extraWaitMs ? config.extraWaitMs / 1000 : 2,
    viewport_width: fingerprint.viewport.width,
    viewport_height: fingerprint.viewport.height,
    user_agent: fingerprint.userAgent,
    locale_code: fingerprint.locale,
    timezone_id: fingerprint.timezone,
  };
  if (config.useStealth) {
    params.simulate_user = true;
    params.override_navigator = true;
    params.magic = true;
  }
  if (config.simulateUser) {
    params.scan_full_page = true;
    params.scroll_delay = 0.5;
  }
  if (config.waitForSelector) {
    params.wait_for = config.waitForSelector;
  }
  if (config.jsCode) {
    params.js_code = config.jsCode;
  }
  if (proxyUrl) {
    params.proxy = proxyUrl;
  }
  try {
    const response = await fetch(`${CRAWL4AI_URL}/crawl`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        urls: [url],
        crawler_config: { type: 'CrawlerRunConfig', params },
      }),
      signal: AbortSignal.timeout(60000),
    });
    if (!response.ok) {
      return { success: false, title: '', content: '', statusCode: response.status };
    }
    const payload = (await response.json()) as {
      results?: Array<{
        success?: boolean;
        markdown?: unknown;
        metadata?: { title?: string };
        status_code?: number;
      }>;
    };
    const first = payload.results?.[0];
    return {
      success: first?.success ?? false,
      title: first?.metadata?.title ?? '',
      content: extractCrawl4aiMarkdown(first?.markdown),
      statusCode: first?.status_code,
    };
  } catch {
    // Timeouts / network failures are reported as a plain unsuccessful result.
    return { success: false, title: '', content: '' };
  }
}
/**
 * Stealth crawl with automatic retry and proxy rotation.
 *
 * Tries progressively heavier strategies (stealth only → stealth + user
 * simulation → stealth + user simulation + proxy), regenerating the browser
 * fingerprint for each attempt, until one succeeds or maxRetries attempts
 * have been made. A blocked proxied attempt triggers a Tor circuit rotation,
 * and a randomized 1–3 s pause separates attempts.
 *
 * @param url URL to crawl.
 * @param options.maxRetries Maximum number of attempts (default 3).
 * @param options.forceProxy Try a proxied strategy first (default false).
 * @param options.requireContent Count an attempt as successful only if it
 *   yielded more than 500 characters of content (default true).
 * @returns The first successful result, or the last failed attempt's result.
 */
export async function stealthCrawl(
  url: string,
  options: {
    maxRetries?: number;
    forceProxy?: boolean;
    requireContent?: boolean;
  } = {}
): Promise<StealthCrawlResult> {
  const { maxRetries = 3, forceProxy = false, requireContent = true } = options;
  let retryCount = 0;
  let lastResult: StealthCrawlResult | null = null;
  // Escalating strategies: start cheap, add user simulation, then a proxy.
  const strategies: CrawlConfig[] = [
    { useProxy: false, useStealth: true, simulateUser: false },
    { useProxy: false, useStealth: true, simulateUser: true, extraWaitMs: 3000 },
    { useProxy: true, useStealth: true, simulateUser: true, extraWaitMs: 5000 },
  ];
  if (forceProxy) {
    strategies.unshift({
      useProxy: true,
      useStealth: true,
      simulateUser: true,
      extraWaitMs: 3000,
    });
  }
  for (const strategy of strategies) {
    if (retryCount >= maxRetries) break;
    // Fresh fingerprint per attempt so repeated requests do not look identical.
    const fingerprint = generateFingerprint();
    let proxyUrl: string | null = null;
    if (strategy.useProxy) {
      proxyUrl = await proxyManager.getProxyUrl();
    }
    const result = await crawlWithConfig(url, strategy, fingerprint, proxyUrl);
    retryCount++;
    const blocked = isBlockedResponse(result.title, result.content, result.statusCode);
    const hasContent = result.content.length > 500;
    lastResult = {
      url,
      title: result.title,
      content: result.content,
      success: result.success && !blocked && (!requireContent || hasContent),
      statusCode: result.statusCode,
      usedProxy: proxyUrl ?? undefined,
      blocked,
      retryCount,
    };
    if (lastResult.success) {
      return lastResult;
    }
    // A blocked proxied attempt suggests the exit IP is burned — rotate it.
    if (blocked && strategy.useProxy) {
      await proxyManager.rotateTorCircuit();
    }
    // Randomized 1–3 s backoff between attempts to avoid a burst pattern.
    await new Promise((resolve) => setTimeout(resolve, 1000 + Math.random() * 2000));
  }
  return (
    lastResult ?? {
      url,
      title: '',
      content: '',
      success: false,
      blocked: true,
      retryCount,
    }
  );
}
/**
 * Crawls many URLs with the stealth crawler, running at most `concurrency`
 * requests at a time (default 2, maxRetries default 2) and pausing a
 * randomized 2–5 s between batches to stay unobtrusive.
 */
export async function stealthCrawlBatch(
  urls: string[],
  options: {
    concurrency?: number;
    maxRetries?: number;
    forceProxy?: boolean;
  } = {}
): Promise<StealthCrawlResult[]> {
  const { concurrency = 2, maxRetries = 2, forceProxy = false } = options;
  const collected: StealthCrawlResult[] = [];
  let offset = 0;
  while (offset < urls.length) {
    const chunk = urls.slice(offset, offset + concurrency);
    const chunkResults = await Promise.all(
      chunk.map((target) => stealthCrawl(target, { maxRetries, forceProxy }))
    );
    collected.push(...chunkResults);
    offset += concurrency;
    if (offset < urls.length) {
      // Randomized pause between batches (2–5 s).
      await new Promise((resolve) => setTimeout(resolve, 2000 + Math.random() * 3000));
    }
  }
  return collected;
}
/**
* Simple crawl without stealth (for trusted sites)
*/
export async function simpleCrawl(url: string): Promise<StealthCrawlResult> {
const fingerprint = generateFingerprint();
const result = await crawlWithConfig(
url,
{ useProxy: false, useStealth: false, simulateUser: false },
fingerprint
);
return {
url,
title: result.title,
content: result.content,
success: result.success && result.content.length > 100,
statusCode: result.statusCode,
blocked: isBlockedResponse(result.title, result.content, result.statusCode),
retryCount: 1,
};
}
/**
 * Heuristic: does this URL belong to a domain known to run antibot
 * protection (major RU/US marketplaces and classifieds)?
 * Unparseable URLs return false.
 */
export function needsStealthMode(url: string): boolean {
  const guardedDomains = [
    'wildberries.ru',
    'ozon.ru',
    'dns-shop.ru',
    'eldorado.ru',
    'lamoda.ru',
    'sportmaster.ru',
    'avito.ru',
    'sberbank.ru',
    'amazon.com',
    'walmart.com',
    'ebay.com',
  ];
  let host: string;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    return false;
  }
  return guardedDomains.some((domain) => host.includes(domain));
}
/**
 * Crawls a URL, automatically choosing between the plain and stealth paths:
 * known-protected domains go straight to forced-proxy stealth; otherwise a
 * plain crawl is attempted first and stealth is used only as a fallback.
 */
export async function smartCrawl(url: string): Promise<StealthCrawlResult> {
  if (needsStealthMode(url)) {
    return stealthCrawl(url, { maxRetries: 3, forceProxy: true });
  }
  const plainResult = await simpleCrawl(url);
  return plainResult.success && !plainResult.blocked
    ? plainResult
    : stealthCrawl(url, { maxRetries: 2 });
}

View File

@@ -1,6 +1,19 @@
import z from 'zod';
import type { ToolDef } from './types.js';
const CRAWL4AI_URL = (process.env.CRAWL4AI_URL ?? 'http://crawl4ai:11235').replace(/\/$/, '');
/**
 * Extracts a markdown string from a Crawl4AI `markdown` field, which may be
 * a plain string or an object with several variants. Preference order:
 * raw_markdown, markdown_with_citations, non-empty fit_markdown; else ''.
 */
function extractCrawl4aiMarkdown(md: unknown): string {
  if (typeof md === 'string') return md;
  if (!md || typeof md !== 'object') return '';
  const variants = md as Record<string, unknown>;
  const { raw_markdown, markdown_with_citations, fit_markdown } = variants;
  if (typeof raw_markdown === 'string') return raw_markdown;
  if (typeof markdown_with_citations === 'string') return markdown_with_citations;
  if (typeof fit_markdown === 'string' && fit_markdown.length > 0) return fit_markdown;
  return '';
}
// Tool input: a single absolute URL whose page text should be extracted.
const schema = z.object({
  url: z.string().url().describe('URL to fetch and extract text from'),
});
@@ -11,21 +24,56 @@ export const scrapeUrlTool: ToolDef = {
schema,
execute: async (params, _ctx) => {
const { url } = schema.parse(params);
const res = await fetch(url, {
signal: AbortSignal.timeout(10000),
headers: { 'User-Agent': 'GooSeek-MasterAgent/1.0' },
});
const html = await res.text();
const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
const body = bodyMatch ? bodyMatch[1] : html;
const text = body
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.slice(0, 8000);
return JSON.stringify({ title, url, content: text }, null, 2);
let title = url;
let content = '';
try {
const res = await fetch(`${CRAWL4AI_URL}/crawl`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
urls: [url],
crawler_config: {
type: 'CrawlerRunConfig',
params: { cache_mode: 'bypass', page_timeout: 15000 },
},
}),
signal: AbortSignal.timeout(25000),
});
if (res.ok) {
const data = (await res.json()) as {
results?: { markdown?: unknown; metadata?: { title?: string }; success?: boolean }[];
};
const first = data.results?.[0];
const md = extractCrawl4aiMarkdown(first?.markdown);
if (first?.success && md) {
title = first.metadata?.title ?? url;
content = md.slice(0, 8000);
}
}
} catch {
// fallback below
}
if (!content) {
const res = await fetch(url, {
signal: AbortSignal.timeout(10000),
headers: { 'User-Agent': 'GooSeek-MasterAgent/1.0' },
});
const html = await res.text();
title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() ?? url;
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
const body = bodyMatch ? bodyMatch[1] : html;
content = body
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.slice(0, 8000);
}
return JSON.stringify({ title, url, content }, null, 2);
},
};

View File

@@ -0,0 +1,414 @@
/**
* Widget Types for GooSeek - Perplexity-style widgets + Russian content
*/
// ============================================
// BASE TYPES
// ============================================
/** Coarse category of a classified search result / widget item. */
export type ContentType =
  | 'product'
  | 'video'
  | 'profile'
  | 'promo'
  | 'article'
  | 'image'
  | 'knowledge_card';
/** Platform a search result originated from. */
export type SourcePlatform =
  | 'yandex'
  | 'vk'
  | 'rutube'
  | 'ozon'
  | 'wildberries'
  | 'aliexpress'
  | 'youtube'
  | 'telegram'
  | 'dzen'
  | 'other';
// ============================================
// PRODUCT DATA (products)
// ============================================
/** A marketplace product offer rendered in product cards/widgets. */
export interface ProductData {
  id?: string;
  title: string;
  // Price in `currency` units (see formatPrice for display formatting).
  price: number;
  currency: string;
  oldPrice?: number;
  discount?: number;
  image: string;
  url: string;
  rating?: number;
  reviewCount?: number;
  seller?: string;
  marketplace: 'ozon' | 'wildberries' | 'aliexpress' | 'yandex_market' | 'other';
  inStock?: boolean;
  deliveryInfo?: string;
  badges?: string[];
}
// ============================================
// VIDEO DATA (videos)
// ============================================
/** A video search result / playable video item. */
export interface VideoData {
  id?: string;
  title: string;
  thumbnail: string;
  url: string;
  // Direct embed URL when the platform provides one (see getVideoEmbedUrl).
  embedUrl?: string;
  // Length of the video — presumably seconds (cf. formatDuration); confirm at call sites.
  duration: number;
  views?: number;
  likes?: number;
  author: string;
  authorUrl?: string;
  authorAvatar?: string;
  platform: 'rutube' | 'vk' | 'youtube' | 'dzen' | 'other';
  publishedAt?: string;
  description?: string;
}
// ============================================
// PROFILE DATA (profiles)
// ============================================
/** A social-network profile/channel result. */
export interface ProfileData {
  id?: string;
  name: string;
  username?: string;
  avatar?: string;
  url: string;
  platform: 'vk' | 'telegram' | 'instagram' | 'youtube' | 'dzen' | 'other';
  followers?: number;
  following?: number;
  verified?: boolean;
  description?: string;
  isOnline?: boolean;
  lastSeen?: string;
}
// ============================================
// PROMO CODE DATA (promo codes)
// ============================================
/** A store promo/discount code with its conditions and validity info. */
export interface PromoData {
  id?: string;
  code: string;
  // Human-readable discount text shown on the coupon.
  discount: string;
  discountType: 'percent' | 'fixed' | 'freeShipping' | 'other';
  discountValue?: number;
  store: string;
  storeLogo?: string;
  storeUrl?: string;
  url: string;
  expiresAt?: string;
  conditions?: string;
  minOrderAmount?: number;
  usageCount?: number;
  verified?: boolean;
}
// ============================================
// IMAGE DATA (images)
// ============================================
/** An image search hit; `source`/`sourceUrl` identify the hosting page. */
export interface ImageData {
  id?: string;
  url: string;
  thumbnailUrl?: string;
  title?: string;
  alt?: string;
  source: string;
  sourceUrl: string;
  width?: number;
  height?: number;
}
// ============================================
// KNOWLEDGE CARD DATA (knowledge cards — Tako-style)
// ============================================
/** Visualization flavor a knowledge card can render as. */
export type KnowledgeCardType =
  | 'comparison_table'
  | 'line_chart'
  | 'bar_chart'
  | 'pie_chart'
  | 'stat_card'
  | 'timeline'
  | 'map'
  | 'quote'
  | 'definition';
/** Side-by-side comparison table; one row per attribute. */
export interface ComparisonTableData {
  type: 'comparison_table';
  columns: string[];
  rows: Array<{
    label: string;
    // One value per column; null marks a missing value.
    values: (string | number | null)[];
    highlight?: boolean;
  }>;
  footer?: string;
}
/** Single labeled value used by bar/pie charts. */
export interface ChartDataPoint {
  label: string;
  value: number;
  color?: string;
}
/** Multi-series line chart; each series is a list of (x, y) points. */
export interface LineChartData {
  type: 'line_chart';
  title?: string;
  xAxisLabel?: string;
  yAxisLabel?: string;
  series: Array<{
    name: string;
    data: Array<{ x: string | number; y: number }>;
    color?: string;
  }>;
}
/** Bar chart; `horizontal` flips the bar orientation. */
export interface BarChartData {
  type: 'bar_chart';
  title?: string;
  xAxisLabel?: string;
  yAxisLabel?: string;
  data: ChartDataPoint[];
  horizontal?: boolean;
}
/** Pie chart; `showPercent` toggles percentage labels. */
export interface PieChartData {
  type: 'pie_chart';
  title?: string;
  data: ChartDataPoint[];
  showPercent?: boolean;
}
/** A row of headline statistics with optional change indicators. */
export interface StatCardData {
  type: 'stat_card';
  stats: Array<{
    label: string;
    value: string | number;
    change?: number;
    changeType?: 'positive' | 'negative' | 'neutral';
    icon?: string;
  }>;
}
/** Chronological list of dated events. */
export interface TimelineData {
  type: 'timeline';
  events: Array<{
    date: string;
    title: string;
    description?: string;
    icon?: string;
  }>;
}
/** Map view centered on a coordinate, with optional markers. */
export interface MapData {
  type: 'map';
  center: { lat: number; lng: number };
  zoom?: number;
  markers?: Array<{
    lat: number;
    lng: number;
    title?: string;
    description?: string;
  }>;
}
/** A quotation with optional attribution. */
export interface QuoteData {
  type: 'quote';
  text: string;
  author?: string;
  source?: string;
  sourceUrl?: string;
}
/** A dictionary-style definition of a term. */
export interface DefinitionData {
  type: 'definition';
  term: string;
  definition: string;
  examples?: string[];
  synonyms?: string[];
  source?: string;
}
/** Discriminated union over all card payloads (discriminant: `type`). */
export type KnowledgeCardData =
  | ComparisonTableData
  | LineChartData
  | BarChartData
  | PieChartData
  | StatCardData
  | TimelineData
  | MapData
  | QuoteData
  | DefinitionData;
/** A titled knowledge card with its payload and optional provenance. */
export interface KnowledgeCard {
  id: string;
  title: string;
  data: KnowledgeCardData;
  source?: string;
  sourceUrl?: string;
  lastUpdated?: string;
}
// ============================================
// SOURCE DATA (sources — for citations)
// ============================================
/** A source reference used for citations. */
export interface SourceData {
  // Citation index as referenced from the answer text.
  index: number;
  url: string;
  title: string;
  domain: string;
  favicon?: string;
  snippet?: string;
  publishedAt?: string;
  author?: string;
  contentType?: ContentType;
}
// ============================================
// WIDGET BLOCKS (for API responses)
// ============================================
/** A strip of product cards. */
export interface ProductsWidgetData {
  widgetType: 'products';
  title?: string;
  items: ProductData[];
  query?: string;
}
/** A strip of video cards. */
export interface VideosWidgetData {
  widgetType: 'videos';
  title?: string;
  items: VideoData[];
  query?: string;
}
/** A strip of profile cards. */
export interface ProfilesWidgetData {
  widgetType: 'profiles';
  title?: string;
  items: ProfileData[];
}
/** A list of promo-code cards. */
export interface PromosWidgetData {
  widgetType: 'promos';
  title?: string;
  items: PromoData[];
}
/** A gallery of images rendered in one of three layouts. */
export interface ImageGalleryWidgetData {
  widgetType: 'image_gallery';
  images: ImageData[];
  layout: 'grid' | 'carousel' | 'masonry';
}
/** A single embedded/playable video. */
export interface VideoEmbedWidgetData {
  widgetType: 'video_embed';
  video: VideoData;
  autoplay?: boolean;
}
/** A single knowledge card. */
export interface KnowledgeCardWidgetData {
  widgetType: 'knowledge_card';
  card: KnowledgeCard;
}
/** Discriminated union over all widget payloads (discriminant: `widgetType`). */
export type RichWidgetData =
  | ProductsWidgetData
  | VideosWidgetData
  | ProfilesWidgetData
  | PromosWidgetData
  | ImageGalleryWidgetData
  | VideoEmbedWidgetData
  | KnowledgeCardWidgetData;
// ============================================
// SEARCH RESULT CLASSIFICATION
// ============================================
/** A raw search result plus its detected type/platform and extracted payload. */
export interface ClassifiedResult {
  type: ContentType;
  source: SourcePlatform;
  // Classifier confidence — assumed to be in [0, 1]; confirm against the classifier.
  confidence: number;
  // Structured payload when extraction succeeded, otherwise null.
  data: ProductData | VideoData | ProfileData | PromoData | ImageData | null;
  // Original search hit the classification was derived from.
  rawResult: {
    url: string;
    title: string;
    content?: string;
    thumbnail?: string;
  };
}
// ============================================
// HELPER FUNCTIONS
// ============================================
/** Formats a price with Russian-locale digit grouping, e.g. 1000 → "1 000 ₽". */
export function formatPrice(price: number, currency: string = '₽'): string {
  const grouped = new Intl.NumberFormat('ru-RU').format(price);
  return `${grouped} ${currency}`;
}
/** Abbreviates a view count: 2_500_000 → "2.5M", 1500 → "1.5K", 500 → "500". */
export function formatViews(views: number): string {
  const MILLION = 1_000_000;
  const THOUSAND = 1_000;
  if (views >= MILLION) {
    return `${(views / MILLION).toFixed(1)}M`;
  }
  if (views >= THOUSAND) {
    return `${(views / THOUSAND).toFixed(1)}K`;
  }
  return String(views);
}
/**
 * Formats a duration in seconds as "M:SS" or, past an hour, "H:MM:SS".
 *
 * All components are floored so fractional input (e.g. 61.5) renders as
 * "1:01" instead of the malformed "1:1.5" a raw modulo would produce;
 * integer inputs are unaffected.
 */
export function formatDuration(seconds: number): string {
  const total = Math.floor(seconds);
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const secs = total % 60;
  if (hours > 0) {
    return `${hours}:${minutes.toString().padStart(2, '0')}:${secs.toString().padStart(2, '0')}`;
  }
  return `${minutes}:${secs.toString().padStart(2, '0')}`;
}
/** Abbreviated follower count with the Russian "подписчиков" suffix. */
export function formatFollowers(count: number): string {
  let amount: string;
  if (count >= 1_000_000) {
    amount = (count / 1_000_000).toFixed(1) + 'M';
  } else if (count >= 1_000) {
    amount = (count / 1_000).toFixed(1) + 'K';
  } else {
    amount = String(count);
  }
  return amount + ' подписчиков';
}
/** Returns an emoji icon for a source platform ('🌐' for anything unknown). */
export function getPlatformIcon(platform: SourcePlatform): string {
  switch (platform) {
    case 'yandex': return '🔍';
    case 'vk': return '💙';
    case 'rutube': return '🎬';
    case 'ozon': return '🛒';
    case 'wildberries': return '🟣';
    case 'aliexpress': return '🛍️';
    case 'youtube': return '▶️';
    case 'telegram': return '✈️';
    case 'dzen': return '📰';
    case 'other': return '🌐';
    default: return '🌐';
  }
}
/**
 * Builds an embeddable player URL for a video.
 *
 * rutube/youtube embed URLs require the platform video id; when `id` is
 * absent the original code produced a broken ".../embed/undefined" link,
 * so we now fall back to the plain page URL instead. VK uses its own
 * embedUrl when present; all other platforms return the page URL.
 */
export function getVideoEmbedUrl(video: VideoData): string {
  switch (video.platform) {
    case 'rutube':
      return video.id ? `https://rutube.ru/play/embed/${video.id}` : video.url;
    case 'youtube':
      return video.id ? `https://www.youtube.com/embed/${video.id}` : video.url;
    case 'vk':
      return video.embedUrl || video.url;
    default:
      return video.url;
  }
}