feat: Go backend, enhanced search, new widgets, Docker deploy
Major changes: - Add Go backend (backend/) with microservices architecture - Enhanced master-agents-svc: reranker, content-classifier, stealth-crawler, proxy-manager, media-search, fastClassifier, language detection - New web-svc widgets: KnowledgeCard, ProductCard, ProfileCard, VideoCard, UnifiedCard, CardGallery, InlineImageGallery, SourcesPanel, RelatedQuestions - Improved discover-svc with discover-db integration - Docker deployment improvements (Caddyfile, vendor.sh, BUILD.md) - Library-svc: project_id schema migration - Remove deprecated finance-svc and travel-svc - Localization improvements across services Made-with: Cursor
This commit is contained in:
343
backend/internal/files/analyzer.go
Normal file
343
backend/internal/files/analyzer.go
Normal file
@@ -0,0 +1,343 @@
|
||||
package files
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/gooseek/backend/internal/llm"
|
||||
"github.com/ledongthuc/pdf"
|
||||
)
|
||||
|
||||
// FileAnalyzer extracts text from uploaded files (PDF, image, plain text)
// and uses an LLM client to produce summaries, key points, and image
// descriptions. Files are stored under storagePath.
type FileAnalyzer struct {
	llmClient   llm.Client // LLM used for summarization and image description
	storagePath string     // directory where uploaded files are saved
}
|
||||
|
||||
// AnalysisResult is the outcome of analyzing a single uploaded file.
type AnalysisResult struct {
	FileType      string                 `json:"fileType"`      // "pdf", "image", or "text"
	ExtractedText string                 `json:"extractedText"` // extracted text (or image description); may be truncated
	Summary       string                 `json:"summary"`       // LLM-generated summary; empty when generation failed
	KeyPoints     []string               `json:"keyPoints"`     // LLM-generated bullet points; nil when unavailable
	Metadata      map[string]interface{} `json:"metadata"`      // type-specific extras (page count, byte size, line count)
}
|
||||
|
||||
func NewFileAnalyzer(llmClient llm.Client, storagePath string) *FileAnalyzer {
|
||||
if storagePath == "" {
|
||||
storagePath = "/tmp/gooseek-files"
|
||||
}
|
||||
os.MkdirAll(storagePath, 0755)
|
||||
|
||||
return &FileAnalyzer{
|
||||
llmClient: llmClient,
|
||||
storagePath: storagePath,
|
||||
}
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) AnalyzeFile(ctx context.Context, filePath string, fileType string) (*AnalysisResult, error) {
|
||||
switch {
|
||||
case strings.HasPrefix(fileType, "application/pdf"):
|
||||
return fa.analyzePDF(ctx, filePath)
|
||||
case strings.HasPrefix(fileType, "image/"):
|
||||
return fa.analyzeImage(ctx, filePath, fileType)
|
||||
case strings.HasPrefix(fileType, "text/"):
|
||||
return fa.analyzeText(ctx, filePath)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported file type: %s", fileType)
|
||||
}
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) analyzePDF(ctx context.Context, filePath string) (*AnalysisResult, error) {
|
||||
text, metadata, err := extractPDFContent(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to extract PDF content: %w", err)
|
||||
}
|
||||
|
||||
if len(text) > 50000 {
|
||||
text = text[:50000] + "\n\n[Content truncated...]"
|
||||
}
|
||||
|
||||
summary, keyPoints, err := fa.generateSummary(ctx, text, "PDF document")
|
||||
if err != nil {
|
||||
summary = ""
|
||||
keyPoints = nil
|
||||
}
|
||||
|
||||
return &AnalysisResult{
|
||||
FileType: "pdf",
|
||||
ExtractedText: text,
|
||||
Summary: summary,
|
||||
KeyPoints: keyPoints,
|
||||
Metadata: metadata,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func extractPDFContent(filePath string) (string, map[string]interface{}, error) {
|
||||
f, r, err := pdf.Open(filePath)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var textBuilder strings.Builder
|
||||
numPages := r.NumPage()
|
||||
|
||||
for i := 1; i <= numPages; i++ {
|
||||
p := r.Page(i)
|
||||
if p.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
|
||||
text, err := p.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
textBuilder.WriteString(text)
|
||||
textBuilder.WriteString("\n\n")
|
||||
|
||||
if textBuilder.Len() > 100000 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"numPages": numPages,
|
||||
}
|
||||
|
||||
return textBuilder.String(), metadata, nil
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) analyzeImage(ctx context.Context, filePath string, mimeType string) (*AnalysisResult, error) {
|
||||
imageData, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read image: %w", err)
|
||||
}
|
||||
|
||||
base64Image := base64.StdEncoding.EncodeToString(imageData)
|
||||
|
||||
description, err := fa.describeImage(ctx, base64Image, mimeType)
|
||||
if err != nil {
|
||||
description = "Image analysis unavailable"
|
||||
}
|
||||
|
||||
metadata := map[string]interface{}{
|
||||
"size": len(imageData),
|
||||
}
|
||||
|
||||
return &AnalysisResult{
|
||||
FileType: "image",
|
||||
ExtractedText: description,
|
||||
Summary: description,
|
||||
KeyPoints: extractKeyPointsFromDescription(description),
|
||||
Metadata: metadata,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) describeImage(ctx context.Context, base64Image, mimeType string) (string, error) {
|
||||
prompt := `Analyze this image and provide:
|
||||
1. A detailed description of what's shown
|
||||
2. Any text visible in the image (OCR)
|
||||
3. Key elements and their relationships
|
||||
4. Any data, charts, or diagrams and their meaning
|
||||
|
||||
Be thorough but concise.`
|
||||
|
||||
messages := []llm.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: prompt,
|
||||
Images: []llm.ImageContent{
|
||||
{
|
||||
Type: mimeType,
|
||||
Data: base64Image,
|
||||
IsBase64: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
|
||||
Messages: messages,
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) analyzeText(ctx context.Context, filePath string) (*AnalysisResult, error) {
|
||||
content, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read file: %w", err)
|
||||
}
|
||||
|
||||
text := string(content)
|
||||
if len(text) > 50000 {
|
||||
text = text[:50000] + "\n\n[Content truncated...]"
|
||||
}
|
||||
|
||||
summary, keyPoints, err := fa.generateSummary(ctx, text, "text document")
|
||||
if err != nil {
|
||||
summary = ""
|
||||
keyPoints = nil
|
||||
}
|
||||
|
||||
return &AnalysisResult{
|
||||
FileType: "text",
|
||||
ExtractedText: text,
|
||||
Summary: summary,
|
||||
KeyPoints: keyPoints,
|
||||
Metadata: map[string]interface{}{
|
||||
"size": len(content),
|
||||
"lineCount": strings.Count(text, "\n") + 1,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) generateSummary(ctx context.Context, text, docType string) (string, []string, error) {
|
||||
if len(text) < 100 {
|
||||
return text, nil, nil
|
||||
}
|
||||
|
||||
truncatedText := text
|
||||
if len(text) > 15000 {
|
||||
truncatedText = text[:15000] + "\n\n[Content truncated for analysis...]"
|
||||
}
|
||||
|
||||
prompt := fmt.Sprintf(`Analyze this %s and provide:
|
||||
|
||||
1. A concise summary (2-3 paragraphs)
|
||||
2. 5-7 key points as bullet points
|
||||
|
||||
Document content:
|
||||
%s
|
||||
|
||||
Format your response as:
|
||||
SUMMARY:
|
||||
[your summary here]
|
||||
|
||||
KEY POINTS:
|
||||
- [point 1]
|
||||
- [point 2]
|
||||
...`, docType, truncatedText)
|
||||
|
||||
result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
|
||||
Messages: []llm.Message{
|
||||
{Role: llm.RoleUser, Content: prompt},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
summary, keyPoints := parseSummaryResponse(result)
|
||||
return summary, keyPoints, nil
|
||||
}
|
||||
|
||||
// parseSummaryResponse splits an LLM response of the form
// "SUMMARY: ... KEY POINTS: - ..." into the summary text and the list of
// bullet points. Bullets may be marked with "-", "•", or "*". If the
// "KEY POINTS:" marker is absent, the whole response is the summary.
func parseSummaryResponse(response string) (string, []string) {
	// SplitN(.., 2) keeps the entire tail after the first marker; the
	// previous Split-based parse silently dropped bullets that appeared
	// after a second "KEY POINTS:" occurrence in the response.
	parts := strings.SplitN(response, "KEY POINTS:", 2)
	if len(parts) < 2 {
		return response, nil
	}

	summary := strings.TrimSpace(strings.TrimPrefix(parts[0], "SUMMARY:"))

	var keyPoints []string
	for _, line := range strings.Split(parts[1], "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "•") && !strings.HasPrefix(line, "*") {
			continue
		}
		point := strings.TrimPrefix(strings.TrimPrefix(strings.TrimPrefix(line, "-"), "•"), "*")
		if point = strings.TrimSpace(point); point != "" {
			keyPoints = append(keyPoints, point)
		}
	}

	return summary, keyPoints
}
|
||||
|
||||
// extractKeyPointsFromDescription derives up to five key points from a
// free-text description by taking its first five sentences, keeping only
// those longer than 20 characters.
func extractKeyPointsFromDescription(description string) []string {
	var points []string
	for i, sentence := range strings.Split(description, ".") {
		// Only the first five sentences are considered; stop instead of
		// scanning (and discarding) every remaining sentence.
		if i >= 5 {
			break
		}
		sentence = strings.TrimSpace(sentence)
		if len(sentence) > 20 {
			points = append(points, sentence+".")
		}
	}
	return points
}
|
||||
|
||||
// DetectMimeType returns the MIME type for a file, preferring a lookup by
// file extension (case-insensitive) and falling back to content sniffing
// via http.DetectContentType on at most the first 512 bytes.
func DetectMimeType(filename string, content []byte) string {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".pdf":
		return "application/pdf"
	case ".png":
		return "image/png"
	case ".jpg", ".jpeg":
		return "image/jpeg"
	case ".gif":
		return "image/gif"
	case ".webp":
		return "image/webp"
	case ".txt":
		return "text/plain"
	case ".md":
		return "text/markdown"
	case ".csv":
		return "text/csv"
	case ".json":
		return "application/json"
	default:
		// http.DetectContentType only examines 512 bytes; clamp directly
		// instead of routing through the min helper.
		if len(content) > 512 {
			content = content[:512]
		}
		return http.DetectContentType(content)
	}
}
|
||||
|
||||
// min returns the smaller of two ints.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||
|
||||
func (fa *FileAnalyzer) SaveFile(filename string, content io.Reader) (string, int64, error) {
|
||||
safeName := filepath.Base(filename)
|
||||
destPath := filepath.Join(fa.storagePath, safeName)
|
||||
|
||||
file, err := os.Create(destPath)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var buf bytes.Buffer
|
||||
size, err := io.Copy(io.MultiWriter(file, &buf), content)
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
}
|
||||
|
||||
return destPath, size, nil
|
||||
}
|
||||
|
||||
func (fa *FileAnalyzer) DeleteFile(filePath string) error {
|
||||
if !strings.HasPrefix(filePath, fa.storagePath) {
|
||||
return fmt.Errorf("invalid file path")
|
||||
}
|
||||
return os.Remove(filePath)
|
||||
}
|
||||
Reference in New Issue
Block a user