feat: монорепо миграция, Discover/SearxNG улучшения
- Миграция на монорепозиторий (apps/frontend, apps/chat-service, etc.) - Discover: проверка SearxNG, понятное empty state при ненастроенном поиске - searxng.ts: валидация URL, проверка JSON-ответа, авто-добавление http:// - docker/searxng-config: настройки для JSON API SearxNG Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
218
apps/uploads-service/src/lib/uploads/manager.ts
Normal file
218
apps/uploads-service/src/lib/uploads/manager.ts
Normal file
@@ -0,0 +1,218 @@
|
||||
import path from "path";
|
||||
import BaseEmbedding from "../models/base/embedding"
|
||||
import crypto from "crypto"
|
||||
import fs from 'fs';
|
||||
import { splitText } from '../../../../shared-utils/src/splitText';
|
||||
import { PDFParse } from 'pdf-parse';
|
||||
import { CanvasFactory } from 'pdf-parse/worker';
|
||||
import officeParser from 'officeparser'
|
||||
|
||||
const supportedMimeTypes = ['application/pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text/plain'] as const
|
||||
|
||||
type SupportedMimeType = typeof supportedMimeTypes[number];
|
||||
|
||||
type UploadManagerParams = {
|
||||
embeddingModel: BaseEmbedding<any>;
|
||||
}
|
||||
|
||||
type RecordedFile = {
|
||||
id: string;
|
||||
name: string;
|
||||
filePath: string;
|
||||
contentPath: string;
|
||||
uploadedAt: string;
|
||||
}
|
||||
|
||||
type FileRes = {
|
||||
fileName: string;
|
||||
fileExtension: string;
|
||||
fileId: string;
|
||||
}
|
||||
|
||||
class UploadManager {
|
||||
private embeddingModel: BaseEmbedding<any>;
|
||||
static uploadsDir = path.join(process.cwd(), 'data', 'uploads');
|
||||
static uploadedFilesRecordPath = path.join(this.uploadsDir, 'uploaded_files.json');
|
||||
|
||||
constructor(private params: UploadManagerParams) {
|
||||
this.embeddingModel = params.embeddingModel;
|
||||
|
||||
if (!fs.existsSync(UploadManager.uploadsDir)) {
|
||||
fs.mkdirSync(UploadManager.uploadsDir, { recursive: true });
|
||||
}
|
||||
|
||||
if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
|
||||
const data = {
|
||||
files: []
|
||||
}
|
||||
|
||||
fs.writeFileSync(UploadManager.uploadedFilesRecordPath, JSON.stringify(data, null, 2));
|
||||
}
|
||||
}
|
||||
|
||||
private static getRecordedFiles(): RecordedFile[] {
|
||||
const data = fs.readFileSync(UploadManager.uploadedFilesRecordPath, 'utf-8');
|
||||
return JSON.parse(data).files;
|
||||
}
|
||||
|
||||
private static addNewRecordedFile(fileRecord: RecordedFile) {
|
||||
const currentData = this.getRecordedFiles()
|
||||
|
||||
currentData.push(fileRecord);
|
||||
|
||||
fs.writeFileSync(UploadManager.uploadedFilesRecordPath, JSON.stringify({ files: currentData }, null, 2));
|
||||
}
|
||||
|
||||
static getFile(fileId: string): RecordedFile | null {
|
||||
const recordedFiles = this.getRecordedFiles();
|
||||
|
||||
return recordedFiles.find(f => f.id === fileId) || null;
|
||||
}
|
||||
|
||||
static getFileChunks(fileId: string): { content: string; embedding: number[] }[] {
|
||||
try {
|
||||
const recordedFile = this.getFile(fileId);
|
||||
|
||||
if (!recordedFile) {
|
||||
throw new Error(`File with ID ${fileId} not found`);
|
||||
}
|
||||
|
||||
const contentData = JSON.parse(fs.readFileSync(recordedFile.contentPath, 'utf-8'))
|
||||
|
||||
return contentData.chunks;
|
||||
} catch (err) {
|
||||
console.log('Error getting file chunks:', err);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
private async extractContentAndEmbed(filePath: string, fileType: SupportedMimeType): Promise<string> {
|
||||
switch (fileType) {
|
||||
case 'text/plain':
|
||||
const content = fs.readFileSync(filePath, 'utf-8');
|
||||
|
||||
const splittedText = splitText(content, 512, 128)
|
||||
const embeddings = await this.embeddingModel.embedText(splittedText)
|
||||
|
||||
if (embeddings.length !== splittedText.length) {
|
||||
throw new Error('Embeddings and text chunks length mismatch');
|
||||
}
|
||||
|
||||
const contentPath = filePath.split('.').slice(0, -1).join('.') + '.content.json';
|
||||
|
||||
const data = {
|
||||
chunks: splittedText.map((text, i) => {
|
||||
return {
|
||||
content: text,
|
||||
embedding: embeddings[i],
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fs.writeFileSync(contentPath, JSON.stringify(data, null, 2));
|
||||
|
||||
return contentPath;
|
||||
case 'application/pdf':
|
||||
const pdfBuffer = fs.readFileSync(filePath);
|
||||
|
||||
const parser = new PDFParse({
|
||||
data: pdfBuffer,
|
||||
CanvasFactory
|
||||
})
|
||||
|
||||
const pdfText = await parser.getText().then(res => res.text)
|
||||
|
||||
const pdfSplittedText = splitText(pdfText, 512, 128)
|
||||
const pdfEmbeddings = await this.embeddingModel.embedText(pdfSplittedText)
|
||||
|
||||
if (pdfEmbeddings.length !== pdfSplittedText.length) {
|
||||
throw new Error('Embeddings and text chunks length mismatch');
|
||||
}
|
||||
|
||||
const pdfContentPath = filePath.split('.').slice(0, -1).join('.') + '.content.json';
|
||||
|
||||
const pdfData = {
|
||||
chunks: pdfSplittedText.map((text, i) => {
|
||||
return {
|
||||
content: text,
|
||||
embedding: pdfEmbeddings[i],
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fs.writeFileSync(pdfContentPath, JSON.stringify(pdfData, null, 2));
|
||||
|
||||
return pdfContentPath;
|
||||
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
|
||||
const docBuffer = fs.readFileSync(filePath);
|
||||
|
||||
const docText = await officeParser.parseOfficeAsync(docBuffer)
|
||||
|
||||
const docSplittedText = splitText(docText, 512, 128)
|
||||
const docEmbeddings = await this.embeddingModel.embedText(docSplittedText)
|
||||
|
||||
if (docEmbeddings.length !== docSplittedText.length) {
|
||||
throw new Error('Embeddings and text chunks length mismatch');
|
||||
}
|
||||
|
||||
const docContentPath = filePath.split('.').slice(0, -1).join('.') + '.content.json';
|
||||
|
||||
const docData = {
|
||||
chunks: docSplittedText.map((text, i) => {
|
||||
return {
|
||||
content: text,
|
||||
embedding: docEmbeddings[i],
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fs.writeFileSync(docContentPath, JSON.stringify(docData, null, 2));
|
||||
|
||||
return docContentPath;
|
||||
default:
|
||||
throw new Error(`Unsupported file type: ${fileType}`);
|
||||
}
|
||||
}
|
||||
|
||||
async processFiles(files: File[]): Promise<FileRes[]> {
|
||||
const processedFiles: FileRes[] = [];
|
||||
|
||||
await Promise.all(files.map(async (file) => {
|
||||
if (!(supportedMimeTypes as unknown as string[]).includes(file.type)) {
|
||||
throw new Error(`File type ${file.type} not supported`);
|
||||
}
|
||||
|
||||
const fileId = crypto.randomBytes(16).toString('hex');
|
||||
|
||||
const fileExtension = file.name.split('.').pop();
|
||||
const fileName = `${crypto.randomBytes(16).toString('hex')}.${fileExtension}`;
|
||||
const filePath = path.join(UploadManager.uploadsDir, fileName);
|
||||
|
||||
const buffer = Buffer.from(await file.arrayBuffer())
|
||||
|
||||
fs.writeFileSync(filePath, buffer);
|
||||
|
||||
const contentFilePath = await this.extractContentAndEmbed(filePath, file.type as SupportedMimeType);
|
||||
|
||||
const fileRecord: RecordedFile = {
|
||||
id: fileId,
|
||||
name: file.name,
|
||||
filePath: filePath,
|
||||
contentPath: contentFilePath,
|
||||
uploadedAt: new Date().toISOString(),
|
||||
}
|
||||
|
||||
UploadManager.addNewRecordedFile(fileRecord);
|
||||
|
||||
processedFiles.push({
|
||||
fileExtension: fileExtension || '',
|
||||
fileId,
|
||||
fileName: file.name
|
||||
});
|
||||
}))
|
||||
|
||||
return processedFiles;
|
||||
}
|
||||
}
|
||||
|
||||
export default UploadManager;
|
||||
122
apps/uploads-service/src/lib/uploads/store.ts
Normal file
122
apps/uploads-service/src/lib/uploads/store.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
import BaseEmbedding from "../models/base/embedding";
|
||||
import UploadManager from "./manager";
|
||||
import computeSimilarity from '../../../../shared-utils/src/computeSimilarity';
|
||||
import { hashObj } from '../../../../shared-utils/src/serverUtils';
|
||||
import type { Chunk } from '../../../../shared-types/src/types';
|
||||
import fs from 'fs';
|
||||
|
||||
type UploadStoreParams = {
|
||||
embeddingModel: BaseEmbedding<any>;
|
||||
fileIds: string[];
|
||||
}
|
||||
|
||||
type StoreRecord = {
|
||||
embedding: number[];
|
||||
content: string;
|
||||
fileId: string;
|
||||
metadata: Record<string, any>
|
||||
}
|
||||
|
||||
class UploadStore {
|
||||
embeddingModel: BaseEmbedding<any>;
|
||||
fileIds: string[];
|
||||
records: StoreRecord[] = [];
|
||||
|
||||
constructor(private params: UploadStoreParams) {
|
||||
this.embeddingModel = params.embeddingModel;
|
||||
this.fileIds = params.fileIds;
|
||||
this.initializeStore()
|
||||
}
|
||||
|
||||
initializeStore() {
|
||||
this.fileIds.forEach((fileId) => {
|
||||
const file = UploadManager.getFile(fileId)
|
||||
|
||||
if (!file) {
|
||||
throw new Error(`File with ID ${fileId} not found`);
|
||||
}
|
||||
|
||||
const chunks = UploadManager.getFileChunks(fileId);
|
||||
|
||||
this.records.push(...chunks.map((chunk) => ({
|
||||
embedding: chunk.embedding,
|
||||
content: chunk.content,
|
||||
fileId: fileId,
|
||||
metadata: {
|
||||
fileName: file.name,
|
||||
title: file.name,
|
||||
url: `file_id://${file.id}`,
|
||||
}
|
||||
})))
|
||||
})
|
||||
}
|
||||
|
||||
async query(queries: string[], topK: number): Promise<Chunk[]> {
|
||||
const queryEmbeddings = await this.embeddingModel.embedText(queries)
|
||||
|
||||
const results: { chunk: Chunk; score: number; }[][] = [];
|
||||
const hashResults: string[][] = []
|
||||
|
||||
await Promise.all(queryEmbeddings.map(async (query) => {
|
||||
const similarities = this.records.map((record, idx) => {
|
||||
return {
|
||||
chunk: {
|
||||
content: record.content,
|
||||
metadata: {
|
||||
...record.metadata,
|
||||
fileId: record.fileId,
|
||||
}
|
||||
},
|
||||
score: computeSimilarity(query, record.embedding)
|
||||
} as { chunk: Chunk; score: number; };
|
||||
}).sort((a, b) => b.score - a.score)
|
||||
|
||||
results.push(similarities)
|
||||
hashResults.push(similarities.map(s => hashObj(s)))
|
||||
}))
|
||||
|
||||
const chunkMap: Map<string, Chunk> = new Map();
|
||||
const scoreMap: Map<string, number> = new Map();
|
||||
const k = 60;
|
||||
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
for (let j = 0; j < results[i].length; j++) {
|
||||
const chunkHash = hashResults[i][j]
|
||||
|
||||
chunkMap.set(chunkHash, results[i][j].chunk);
|
||||
scoreMap.set(chunkHash, (scoreMap.get(chunkHash) || 0) + results[i][j].score / (j + 1 + k));
|
||||
}
|
||||
}
|
||||
|
||||
const finalResults = Array.from(scoreMap.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(([chunkHash, _score]) => {
|
||||
return chunkMap.get(chunkHash)!;
|
||||
})
|
||||
|
||||
return finalResults.slice(0, topK);
|
||||
}
|
||||
|
||||
static getFileData(fileIds: string[]): { fileName: string; initialContent: string }[] {
|
||||
const filesData: { fileName: string; initialContent: string }[] = [];
|
||||
|
||||
fileIds.forEach((fileId) => {
|
||||
const file = UploadManager.getFile(fileId)
|
||||
|
||||
if (!file) {
|
||||
throw new Error(`File with ID ${fileId} not found`);
|
||||
}
|
||||
|
||||
const chunks = UploadManager.getFileChunks(fileId);
|
||||
|
||||
filesData.push({
|
||||
fileName: file.name,
|
||||
initialContent: chunks.slice(0, 3).map(c => c.content).join('\n---\n'),
|
||||
})
|
||||
})
|
||||
|
||||
return filesData
|
||||
}
|
||||
}
|
||||
|
||||
export default UploadStore
|
||||
Reference in New Issue
Block a user