feat: монорепо миграция, Discover/SearxNG улучшения

- Миграция на монорепозиторий (apps/frontend, apps/chat-service, etc.)
- Discover: проверка SearxNG, понятное empty state при ненастроенном поиске
- searxng.ts: валидация URL, проверка JSON-ответа, авто-добавление http://
- docker/searxng-config: настройки для JSON API SearxNG

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
home
2026-02-20 17:03:43 +03:00
parent c839a0c472
commit 783569b8e7
344 changed files with 28299 additions and 6034 deletions

View File

@@ -0,0 +1,218 @@
import path from "path";
import BaseEmbedding from "../models/base/embedding"
import crypto from "crypto"
import fs from 'fs';
import { splitText } from '../../../../shared-utils/src/splitText';
import { PDFParse } from 'pdf-parse';
import { CanvasFactory } from 'pdf-parse/worker';
import officeParser from 'officeparser'
// MIME types the upload pipeline can extract text from: PDF, DOCX, and plain text.
const supportedMimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text/plain'] as const
// Union of the literal MIME-type strings above.
type SupportedMimeType = typeof supportedMimeTypes[number];
// Constructor dependencies for UploadManager.
type UploadManagerParams = {
embeddingModel: BaseEmbedding<any>;
}
// One entry in the uploaded_files.json index kept on disk.
type RecordedFile = {
id: string; // random hex ID handed back to clients
name: string; // original (user-supplied) file name
filePath: string; // where the raw upload bytes are stored
contentPath: string; // where the extracted chunks + embeddings JSON lives
uploadedAt: string; // ISO-8601 timestamp of the upload
}
// Per-file summary returned to the caller of processFiles.
type FileRes = {
fileName: string;
fileExtension: string;
fileId: string;
}
/**
 * Persists uploaded files to disk, extracts their text, chunks and embeds
 * the text, and maintains a JSON index (uploaded_files.json) of every
 * processed file.
 */
class UploadManager {
  private embeddingModel: BaseEmbedding<any>;

  /** Directory holding raw uploads and their extracted-content JSON. */
  static uploadsDir = path.join(process.cwd(), 'data', 'uploads');
  /** Index file; shape on disk: { files: RecordedFile[] }. */
  static uploadedFilesRecordPath = path.join(this.uploadsDir, 'uploaded_files.json');

  constructor(private params: UploadManagerParams) {
    this.embeddingModel = params.embeddingModel;
    // Ensure the uploads directory and the index file exist before first use.
    if (!fs.existsSync(UploadManager.uploadsDir)) {
      fs.mkdirSync(UploadManager.uploadsDir, { recursive: true });
    }
    if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
      fs.writeFileSync(
        UploadManager.uploadedFilesRecordPath,
        JSON.stringify({ files: [] }, null, 2),
      );
    }
  }

  /** Reads the on-disk index of all uploaded files. */
  private static getRecordedFiles(): RecordedFile[] {
    const data = fs.readFileSync(UploadManager.uploadedFilesRecordPath, 'utf-8');
    return JSON.parse(data).files;
  }

  /**
   * Appends a record to the on-disk index.
   * NOTE(review): this read-modify-write is not atomic; concurrent uploads
   * could drop records — consider a lock or an append-only log.
   */
  private static addNewRecordedFile(fileRecord: RecordedFile) {
    const currentData = this.getRecordedFiles();
    currentData.push(fileRecord);
    fs.writeFileSync(
      UploadManager.uploadedFilesRecordPath,
      JSON.stringify({ files: currentData }, null, 2),
    );
  }

  /** Looks up a recorded file by ID; null when unknown. */
  static getFile(fileId: string): RecordedFile | null {
    return this.getRecordedFiles().find((f) => f.id === fileId) ?? null;
  }

  /**
   * Loads the pre-computed chunks (text + embedding) for a file.
   * Returns [] on any failure so callers can degrade gracefully.
   */
  static getFileChunks(fileId: string): { content: string; embedding: number[] }[] {
    try {
      const recordedFile = this.getFile(fileId);
      if (!recordedFile) {
        throw new Error(`File with ID ${fileId} not found`);
      }
      const contentData = JSON.parse(fs.readFileSync(recordedFile.contentPath, 'utf-8'));
      return contentData.chunks;
    } catch (err) {
      // Use the error stream, not stdout, for failures.
      console.error('Error getting file chunks:', err);
      return [];
    }
  }

  /**
   * Splits `text` into overlapping chunks (512 chars, 128 overlap), embeds
   * each chunk, and writes { chunks: [{ content, embedding }] } as JSON next
   * to the source file (<name>.content.json).
   *
   * Extracted from the three formerly copy-pasted switch branches in
   * extractContentAndEmbed.
   *
   * @returns path of the written .content.json file.
   * @throws when the embedding model returns a different number of vectors
   *         than there are chunks.
   */
  private async embedAndPersist(text: string, filePath: string): Promise<string> {
    const chunks = splitText(text, 512, 128);
    const embeddings = await this.embeddingModel.embedText(chunks);
    if (embeddings.length !== chunks.length) {
      throw new Error('Embeddings and text chunks length mismatch');
    }
    // Replace the original extension with .content.json.
    const contentPath = filePath.split('.').slice(0, -1).join('.') + '.content.json';
    const data = {
      chunks: chunks.map((content, i) => ({ content, embedding: embeddings[i] })),
    };
    fs.writeFileSync(contentPath, JSON.stringify(data, null, 2));
    return contentPath;
  }

  /**
   * Extracts plain text from a stored file according to its MIME type, then
   * chunks/embeds/persists it via embedAndPersist.
   * @returns path to the generated .content.json file.
   * @throws for unsupported MIME types or extraction failures.
   */
  private async extractContentAndEmbed(filePath: string, fileType: SupportedMimeType): Promise<string> {
    switch (fileType) {
      case 'text/plain': {
        const content = fs.readFileSync(filePath, 'utf-8');
        return this.embedAndPersist(content, filePath);
      }
      case 'application/pdf': {
        const parser = new PDFParse({ data: fs.readFileSync(filePath), CanvasFactory });
        const pdfText = (await parser.getText()).text;
        return this.embedAndPersist(pdfText, filePath);
      }
      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {
        const docText = await officeParser.parseOfficeAsync(fs.readFileSync(filePath));
        return this.embedAndPersist(docText, filePath);
      }
      default:
        throw new Error(`Unsupported file type: ${fileType}`);
    }
  }

  /**
   * Validates, stores, extracts and embeds a batch of uploaded files in
   * parallel. Results are returned in input order (the original pushed into
   * a shared array from concurrent callbacks, making order nondeterministic).
   * @throws when any file has an unsupported MIME type.
   */
  async processFiles(files: File[]): Promise<FileRes[]> {
    return Promise.all(
      files.map(async (file): Promise<FileRes> => {
        // A readonly tuple of string literals is assignable to
        // readonly string[] — no unsafe double-cast needed.
        if (!(supportedMimeTypes as readonly string[]).includes(file.type)) {
          throw new Error(`File type ${file.type} not supported`);
        }
        const fileId = crypto.randomBytes(16).toString('hex');
        const fileExtension = file.name.split('.').pop() ?? '';
        // Store under a random name so user-supplied names cannot collide
        // or smuggle path components.
        const storedName = `${crypto.randomBytes(16).toString('hex')}.${fileExtension}`;
        const filePath = path.join(UploadManager.uploadsDir, storedName);
        fs.writeFileSync(filePath, Buffer.from(await file.arrayBuffer()));
        const contentPath = await this.extractContentAndEmbed(filePath, file.type as SupportedMimeType);
        UploadManager.addNewRecordedFile({
          id: fileId,
          name: file.name,
          filePath,
          contentPath,
          uploadedAt: new Date().toISOString(),
        });
        return { fileId, fileExtension, fileName: file.name };
      }),
    );
  }
}
export default UploadManager;

View File

@@ -0,0 +1,122 @@
import BaseEmbedding from "../models/base/embedding";
import UploadManager from "./manager";
import computeSimilarity from '../../../../shared-utils/src/computeSimilarity';
import { hashObj } from '../../../../shared-utils/src/serverUtils';
import type { Chunk } from '../../../../shared-types/src/types';
import fs from 'fs';
// Constructor dependencies for UploadStore.
type UploadStoreParams = {
embeddingModel: BaseEmbedding<any>;
fileIds: string[]; // IDs of previously uploaded files to load into the store
}
// A single chunk held in memory: embedding vector, text, owning file ID,
// and display metadata (fileName/title/url).
type StoreRecord = {
embedding: number[];
content: string;
fileId: string;
metadata: Record<string, any>
}
/**
 * In-memory vector store over previously uploaded files. Loads every chunk
 * (text + embedding) for the given file IDs at construction and answers
 * similarity queries, fusing multi-query rankings with score-weighted
 * reciprocal-rank fusion.
 */
class UploadStore {
  embeddingModel: BaseEmbedding<any>;
  fileIds: string[];
  records: StoreRecord[] = [];

  constructor(private params: UploadStoreParams) {
    this.embeddingModel = params.embeddingModel;
    this.fileIds = params.fileIds;
    this.initializeStore();
  }

  /**
   * Loads all chunks for every configured file into `records`.
   * @throws when a file ID has no record on disk.
   */
  initializeStore() {
    for (const fileId of this.fileIds) {
      const file = UploadManager.getFile(fileId);
      if (!file) {
        throw new Error(`File with ID ${fileId} not found`);
      }
      const chunks = UploadManager.getFileChunks(fileId);
      this.records.push(
        ...chunks.map((chunk) => ({
          embedding: chunk.embedding,
          content: chunk.content,
          fileId,
          metadata: {
            fileName: file.name,
            title: file.name,
            url: `file_id://${file.id}`,
          },
        })),
      );
    }
  }

  /**
   * Embeds each query, ranks every stored chunk by similarity per query,
   * fuses the per-query rankings with score-weighted reciprocal-rank fusion
   * (score / (rank + 1 + k), k = 60), and returns the top-K fused chunks.
   */
  async query(queries: string[], topK: number): Promise<Chunk[]> {
    const queryEmbeddings = await this.embeddingModel.embedText(queries);
    // One ranked result list per query. The ranking is entirely synchronous,
    // so no Promise.all is needed (the original used async callbacks that
    // never awaited and pushed into shared arrays).
    const rankings = queryEmbeddings.map((queryEmbedding) =>
      this.records
        .map((record) => ({
          chunk: {
            content: record.content,
            metadata: { ...record.metadata, fileId: record.fileId },
          } as Chunk,
          score: computeSimilarity(queryEmbedding, record.embedding),
        }))
        .sort((a, b) => b.score - a.score),
    );
    // Fuse rankings. BUGFIX: hash only the chunk — the original hashed the
    // whole { chunk, score } pair, and since per-query scores differ, the
    // same chunk got a different hash in every ranking, so the cross-query
    // score accumulation below never actually fused anything.
    const chunkMap = new Map<string, Chunk>();
    const scoreMap = new Map<string, number>();
    const k = 60;
    for (const ranked of rankings) {
      ranked.forEach(({ chunk, score }, rank) => {
        const chunkHash = hashObj(chunk);
        chunkMap.set(chunkHash, chunk);
        scoreMap.set(chunkHash, (scoreMap.get(chunkHash) ?? 0) + score / (rank + 1 + k));
      });
    }
    return Array.from(scoreMap.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, topK)
      .map(([chunkHash]) => chunkMap.get(chunkHash)!);
  }

  /**
   * Returns, for each file ID, its display name and a preview built from the
   * first three extracted chunks joined with '---' separators.
   * @throws when a file ID has no record on disk.
   */
  static getFileData(fileIds: string[]): { fileName: string; initialContent: string }[] {
    return fileIds.map((fileId) => {
      const file = UploadManager.getFile(fileId);
      if (!file) {
        throw new Error(`File with ID ${fileId} not found`);
      }
      const chunks = UploadManager.getFileChunks(fileId);
      return {
        fileName: file.name,
        initialContent: chunks.slice(0, 3).map((c) => c.content).join('\n---\n'),
      };
    });
  }
}
export default UploadStore;