// Package files provides storage and LLM-backed analysis of user-uploaded
// files: text extraction (PDF / plain text), image description, and
// summary / key-point generation.
package files

import (
	"context"
	"encoding/base64"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"unicode/utf8"

	"github.com/gooseek/backend/internal/llm"
	"github.com/ledongthuc/pdf"
)

const (
	// maxExtractedText caps the extracted text retained in a result.
	maxExtractedText = 50000
	// maxPDFText stops PDF page extraction once this many bytes are collected.
	maxPDFText = 100000
	// maxSummaryInput caps the text sent to the LLM for summarization.
	maxSummaryInput = 15000
)

// FileAnalyzer extracts content from uploaded files and asks an LLM to
// summarize or describe them. Uploaded files are persisted under storagePath.
type FileAnalyzer struct {
	llmClient   llm.Client
	storagePath string
}

// AnalysisResult is the outcome of analyzing a single file.
type AnalysisResult struct {
	FileType      string                 `json:"fileType"`      // "pdf", "image", or "text"
	ExtractedText string                 `json:"extractedText"` // raw text (or image description), possibly truncated
	Summary       string                 `json:"summary"`       // LLM summary; empty if generation failed
	KeyPoints     []string               `json:"keyPoints"`     // bullet points parsed from the LLM response
	Metadata      map[string]interface{} `json:"metadata"`      // type-specific extras (page count, size, ...)
}

// NewFileAnalyzer returns a FileAnalyzer storing files under storagePath
// (default "/tmp/gooseek-files" when empty).
func NewFileAnalyzer(llmClient llm.Client, storagePath string) *FileAnalyzer {
	if storagePath == "" {
		storagePath = "/tmp/gooseek-files"
	}
	// Best-effort: the signature cannot surface an error here; a failure
	// shows up later as an os.Create error in SaveFile.
	_ = os.MkdirAll(storagePath, 0755)
	return &FileAnalyzer{
		llmClient:   llmClient,
		storagePath: storagePath,
	}
}

// AnalyzeFile dispatches to the analyzer matching fileType (a MIME type).
// JSON is treated as text so that everything DetectMimeType can report for a
// known extension is also analyzable. Unsupported types return an error.
func (fa *FileAnalyzer) AnalyzeFile(ctx context.Context, filePath string, fileType string) (*AnalysisResult, error) {
	switch {
	case strings.HasPrefix(fileType, "application/pdf"):
		return fa.analyzePDF(ctx, filePath)
	case strings.HasPrefix(fileType, "image/"):
		return fa.analyzeImage(ctx, filePath, fileType)
	case strings.HasPrefix(fileType, "text/"), strings.HasPrefix(fileType, "application/json"):
		return fa.analyzeText(ctx, filePath)
	default:
		return nil, fmt.Errorf("unsupported file type: %s", fileType)
	}
}

// analyzePDF extracts text from the PDF at filePath and generates a summary.
// Summary generation is best-effort: on LLM failure the result still carries
// the extracted text.
func (fa *FileAnalyzer) analyzePDF(ctx context.Context, filePath string) (*AnalysisResult, error) {
	text, metadata, err := extractPDFContent(filePath)
	if err != nil {
		return nil, fmt.Errorf("failed to extract PDF content: %w", err)
	}
	text = truncateAtRuneBoundary(text, maxExtractedText, "\n\n[Content truncated...]")
	summary, keyPoints, err := fa.generateSummary(ctx, text, "PDF document")
	if err != nil {
		// Best-effort: keep the extracted text even when the LLM call fails.
		summary = ""
		keyPoints = nil
	}
	return &AnalysisResult{
		FileType:      "pdf",
		ExtractedText: text,
		Summary:       summary,
		KeyPoints:     keyPoints,
		Metadata:      metadata,
	}, nil
}

// extractPDFContent returns the plain text of every readable page plus
// metadata (page count). Unreadable pages are skipped; extraction stops once
// maxPDFText bytes are collected.
func extractPDFContent(filePath string) (string, map[string]interface{}, error) {
	f, r, err := pdf.Open(filePath)
	if err != nil {
		return "", nil, err
	}
	defer f.Close()

	var textBuilder strings.Builder
	numPages := r.NumPage()
	for i := 1; i <= numPages; i++ {
		p := r.Page(i)
		if p.V.IsNull() {
			continue
		}
		text, err := p.GetPlainText(nil)
		if err != nil {
			// Skip pages that fail to decode rather than aborting the file.
			continue
		}
		textBuilder.WriteString(text)
		textBuilder.WriteString("\n\n")
		if textBuilder.Len() > maxPDFText {
			break
		}
	}
	metadata := map[string]interface{}{
		"numPages": numPages,
	}
	return textBuilder.String(), metadata, nil
}

// analyzeImage reads the image at filePath and asks the LLM for a
// description. Description failure is non-fatal: a placeholder is used and
// the result is still returned.
func (fa *FileAnalyzer) analyzeImage(ctx context.Context, filePath string, mimeType string) (*AnalysisResult, error) {
	imageData, err := os.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("failed to read image: %w", err)
	}
	base64Image := base64.StdEncoding.EncodeToString(imageData)
	description, err := fa.describeImage(ctx, base64Image, mimeType)
	if err != nil {
		description = "Image analysis unavailable"
	}
	metadata := map[string]interface{}{
		"size": len(imageData),
	}
	return &AnalysisResult{
		FileType:      "image",
		ExtractedText: description,
		Summary:       description,
		KeyPoints:     extractKeyPointsFromDescription(description),
		Metadata:      metadata,
	}, nil
}

// describeImage sends the base64-encoded image to the LLM and returns its
// textual description (including any OCR'd text).
func (fa *FileAnalyzer) describeImage(ctx context.Context, base64Image, mimeType string) (string, error) {
	prompt := `Analyze this image and provide: 1. A detailed description of what's shown 2. Any text visible in the image (OCR) 3. Key elements and their relationships 4. Any data, charts, or diagrams and their meaning Be thorough but concise.`
	messages := []llm.Message{
		{
			Role:    llm.RoleUser, // consistent with generateSummary
			Content: prompt,
			Images: []llm.ImageContent{
				{
					Type:     mimeType,
					Data:     base64Image,
					IsBase64: true,
				},
			},
		},
	}
	result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
		Messages: messages,
	})
	if err != nil {
		return "", err
	}
	return result, nil
}

// analyzeText reads a plain-text file and generates a summary. Summary
// generation is best-effort (see analyzePDF).
func (fa *FileAnalyzer) analyzeText(ctx context.Context, filePath string) (*AnalysisResult, error) {
	content, err := os.ReadFile(filePath)
	if err != nil {
		return nil, fmt.Errorf("failed to read file: %w", err)
	}
	text := truncateAtRuneBoundary(string(content), maxExtractedText, "\n\n[Content truncated...]")
	summary, keyPoints, err := fa.generateSummary(ctx, text, "text document")
	if err != nil {
		summary = ""
		keyPoints = nil
	}
	return &AnalysisResult{
		FileType:      "text",
		ExtractedText: text,
		Summary:       summary,
		KeyPoints:     keyPoints,
		Metadata: map[string]interface{}{
			"size":      len(content),
			"lineCount": strings.Count(text, "\n") + 1,
		},
	}, nil
}

// generateSummary asks the LLM for a summary plus key points of text
// (docType names the document kind in the prompt). Very short texts are
// returned as their own summary without an LLM call.
func (fa *FileAnalyzer) generateSummary(ctx context.Context, text, docType string) (string, []string, error) {
	if len(text) < 100 {
		return text, nil, nil
	}
	truncatedText := truncateAtRuneBoundary(text, maxSummaryInput, "\n\n[Content truncated for analysis...]")
	prompt := fmt.Sprintf(`Analyze this %s and provide: 1. A concise summary (2-3 paragraphs) 2. 5-7 key points as bullet points Document content: %s Format your response as: SUMMARY: [your summary here] KEY POINTS: - [point 1] - [point 2] ...`, docType, truncatedText)
	result, err := fa.llmClient.GenerateText(ctx, llm.StreamRequest{
		Messages: []llm.Message{
			{Role: llm.RoleUser, Content: prompt},
		},
	})
	if err != nil {
		return "", nil, err
	}
	summary, keyPoints := parseSummaryResponse(result)
	return summary, keyPoints, nil
}

// truncateAtRuneBoundary shortens s to at most limit bytes without splitting
// a multi-byte UTF-8 sequence, appending marker when truncation occurred.
func truncateAtRuneBoundary(s string, limit int, marker string) string {
	if len(s) <= limit {
		return s
	}
	// Back up until the cut lands on the first byte of a rune.
	for limit > 0 && !utf8.RuneStart(s[limit]) {
		limit--
	}
	return s[:limit] + marker
}

// parseSummaryResponse splits an LLM response of the form
// "SUMMARY: ... KEY POINTS: - ..." into the summary text and a slice of
// bullet points. When the "KEY POINTS:" marker is absent, the whole response
// becomes the summary.
func parseSummaryResponse(response string) (string, []string) {
	var summary string
	var keyPoints []string
	parts := strings.Split(response, "KEY POINTS:")
	if len(parts) >= 2 {
		summaryPart := strings.TrimPrefix(parts[0], "SUMMARY:")
		summary = strings.TrimSpace(summaryPart)
		keyPointsPart := parts[1]
		for _, line := range strings.Split(keyPointsPart, "\n") {
			line = strings.TrimSpace(line)
			if strings.HasPrefix(line, "-") || strings.HasPrefix(line, "•") || strings.HasPrefix(line, "*") {
				point := strings.TrimPrefix(strings.TrimPrefix(strings.TrimPrefix(line, "-"), "•"), "*")
				point = strings.TrimSpace(point)
				if point != "" {
					keyPoints = append(keyPoints, point)
				}
			}
		}
	} else {
		summary = response
	}
	return summary, keyPoints
}

// extractKeyPointsFromDescription derives up to five key points from an
// image description by taking its first substantial sentences (> 20 chars).
func extractKeyPointsFromDescription(description string) []string {
	var points []string
	for i, s := range strings.Split(description, ".") {
		if i >= 5 {
			break
		}
		s = strings.TrimSpace(s)
		if len(s) > 20 {
			points = append(points, s+".")
		}
	}
	return points
}

// DetectMimeType maps well-known file extensions to MIME types, falling back
// to content sniffing (http.DetectContentType on the first 512 bytes) for
// anything else.
func DetectMimeType(filename string, content []byte) string {
	ext := strings.ToLower(filepath.Ext(filename))
	switch ext {
	case ".pdf":
		return "application/pdf"
	case ".png":
		return "image/png"
	case ".jpg", ".jpeg":
		return "image/jpeg"
	case ".gif":
		return "image/gif"
	case ".webp":
		return "image/webp"
	case ".txt":
		return "text/plain"
	case ".md":
		return "text/markdown"
	case ".csv":
		return "text/csv"
	case ".json":
		return "application/json"
	default:
		return http.DetectContentType(content[:min(512, len(content))])
	}
}

// min returns the smaller of a and b. Kept as a local helper for
// compatibility with Go versions before the 1.21 builtin.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// SaveFile streams content into fa.storagePath under the basename of
// filename (path components are stripped to prevent traversal) and returns
// the destination path and the number of bytes written. A failed or
// partially written file is removed.
func (fa *FileAnalyzer) SaveFile(filename string, content io.Reader) (string, int64, error) {
	safeName := filepath.Base(filename)
	// filepath.Base can still yield ".", "..", or the separator for
	// degenerate inputs; reject those rather than targeting the storage dir.
	if safeName == "." || safeName == ".." || safeName == string(filepath.Separator) {
		return "", 0, fmt.Errorf("invalid filename: %q", filename)
	}
	destPath := filepath.Join(fa.storagePath, safeName)
	file, err := os.Create(destPath)
	if err != nil {
		return "", 0, err
	}
	size, err := io.Copy(file, content)
	// Close explicitly so a deferred close cannot swallow a flush error.
	if cerr := file.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		os.Remove(destPath) // don't leave a partial file behind
		return "", 0, err
	}
	return destPath, size, nil
}

// DeleteFile removes a file previously stored by SaveFile. The path is
// cleaned and must lie strictly inside fa.storagePath; a bare prefix match
// would also accept sibling directories (e.g. storagePath + "-evil") and
// ".." traversal.
func (fa *FileAnalyzer) DeleteFile(filePath string) error {
	clean := filepath.Clean(filePath)
	if !strings.HasPrefix(clean, fa.storagePath+string(filepath.Separator)) {
		return fmt.Errorf("invalid file path")
	}
	return os.Remove(clean)
}