File size: 29,036 Bytes

373c769

const express = require('express');
const cors = require('cors');
const multer = require('multer');
const sharp = require('sharp');
const { GoogleGenerativeAI } = require('@google/generative-ai');
const fs = require('fs');
const path = require('path');

// Server-side API key (more secure for public deployment).
// Read from the GEMINI_API_KEY environment variable; null when unset, in
// which case the /api/ocr handler falls back to an `apiKey` request field.
const GEMINI_API_KEY = process.env.GEMINI_API_KEY || null;
// pdf-poppler supplies pdf.convert(), which convertPdfToImages calls.
const pdf = require('pdf-poppler');

// Express application; routes are attached below and it is exported at the
// end of the file.
const app = express();
// Listening port: the PORT environment variable, otherwise 3002.
const PORT = process.env.PORT || 3002;

// Cleanup function for temporary files
/**
 * Deletes stale files from the `uploads` directory that sits next to this
 * source file.
 *
 * - If the directory does not exist it is created and nothing else happens.
 * - Only regular files whose modification time is more than 30 minutes old
 *   are removed; sub-directories and newer files are left alone.
 * - Each entry is handled independently, so one entry that cannot be
 *   inspected or deleted never stops the rest of the sweep.
 *
 * The function never throws: unexpected errors are logged and swallowed so
 * that callers (startup, timer, shutdown hooks) keep running.
 *
 * @returns {void}
 */
function cleanupTempFiles() {
    const uploadsDir = path.join(__dirname, 'uploads');
    const maxAge = 30 * 60 * 1000; // 30 minutes in milliseconds

    try {
        if (!fs.existsSync(uploadsDir)) {
            fs.mkdirSync(uploadsDir, { recursive: true });
            console.log('📁 Created uploads directory');
            return;
        }

        // One reference time for the whole sweep; it does not change per file.
        const now = Date.now();
        let cleanedCount = 0;

        for (const file of fs.readdirSync(uploadsDir)) {
            const filePath = path.join(uploadsDir, file);

            try {
                const stats = fs.statSync(filePath);

                // unlink only works on files; skip directories and the like.
                if (!stats.isFile()) {
                    continue;
                }

                // Keep anything modified within the last 30 minutes.
                if (now - stats.mtimeMs <= maxAge) {
                    continue;
                }

                fs.unlinkSync(filePath);
                cleanedCount++;
                console.log(`🗑️  Cleaned up old temp file: ${file}`);
            } catch (error) {
                // The entry disappeared between readdir and stat/unlink
                // (for example a request handler removed it): nothing to do.
                if (error.code === 'ENOENT') {
                    continue;
                }
                console.warn(`⚠️  Could not delete ${file}:`, error.message);
            }
        }

        if (cleanedCount > 0) {
            console.log(`✅ Cleaned up ${cleanedCount} temporary files`);
        } else {
            console.log('✅ No temporary files to clean up');
        }

    } catch (error) {
        console.error('❌ Error during cleanup:', error.message);
    }
}

// Sweep stale temp files once as soon as the module loads
cleanupTempFiles();

// ...and then again on a fixed 15-minute timer
const CLEANUP_INTERVAL_MS = 15 * 60 * 1000; // 15 minutes

function runPeriodicCleanup() {
    console.log('🔄 Running periodic cleanup...');
    cleanupTempFiles();
}

setInterval(runPeriodicCleanup, CLEANUP_INTERVAL_MS);

// Global middleware, registered in order: CORS handling, then JSON body parsing
[cors(), express.json()].forEach((middleware) => app.use(middleware));

// Configure multer for file uploads
// Uploads are stored in the `uploads` directory next to this file, which is
// the same directory cleanupTempFiles() sweeps. A path relative to the
// working directory would drift away from it whenever the server is started
// from somewhere else, leaving uploads that are never cleaned.
const upload = multer({
    dest: path.join(__dirname, 'uploads'),
    limits: {
        fileSize: 10 * 1024 * 1024, // 10MB limit
    },
    // Accept only the MIME types the OCR pipeline handles; anything else is
    // rejected with an Error that reaches the error-handling middleware.
    fileFilter: (req, file, cb) => {
        const allowedTypes = ['image/jpeg', 'image/png', 'image/webp', 'application/pdf'];
        if (allowedTypes.includes(file.mimetype)) {
            cb(null, true);
        } else {
            cb(new Error('Invalid file type. Only JPEG, PNG, WebP, and PDF are allowed.'));
        }
    }
});

// PDF to image conversion using pdf-poppler
/**
 * Renders every page of a PDF to a PNG file in the PDF's own directory and
 * returns the page image paths in page order.
 *
 * Output files are named with a prefix derived from the PDF's file name, so
 * two conversions running in the same directory cannot overwrite or pick up
 * each other's pages. Pages are then discovered by scanning the directory
 * for `<prefix>-<digits>.png` and sorted numerically, which works whether or
 * not the page numbers are zero-padded.
 * NOTE(review): assumes pdf-poppler names outputs `<out_prefix>-<n>.png`,
 * the same shape the previous lookup expected — confirm against the
 * installed version.
 *
 * @param {string} pdfPath - Path of the PDF file to convert.
 * @returns {Promise<string[]>} Page image paths, first page first. Empty
 *   when no page image was produced.
 * @throws {Error} `PDF conversion failed: <reason>` when conversion or the
 *   directory scan fails.
 */
async function convertPdfToImages(pdfPath) {
    try {
        const outputDir = path.dirname(pdfPath);
        // Unique per input file, unlike a fixed prefix shared by all uploads.
        const outPrefix = `${path.basename(pdfPath)}-page`;

        console.log('Converting PDF to images using pdf-poppler...');

        const options = {
            format: 'png',
            out_dir: outputDir,
            out_prefix: outPrefix,
            page: null, // Convert all pages
            scale: 2048 // High resolution for better OCR
        };

        console.log('PDF conversion options:', options);

        // Convert PDF to images
        const results = await pdf.convert(pdfPath, options);

        console.log('PDF conversion results:', results);

        // Collect only this conversion's pages: "<prefix>-<digits>.png".
        // This also excludes derived files such as "..._enhanced.png".
        const namePrefix = `${outPrefix}-`;
        const nameSuffix = '.png';
        const pages = [];

        for (const file of fs.readdirSync(outputDir)) {
            if (!file.startsWith(namePrefix) || !file.endsWith(nameSuffix)) {
                continue;
            }

            const pageToken = file.slice(namePrefix.length, file.length - nameSuffix.length);
            if (!/^\d+$/.test(pageToken)) {
                continue;
            }

            pages.push({
                pageNumber: Number.parseInt(pageToken, 10),
                imagePath: path.join(outputDir, file)
            });
        }

        // Directory listings carry no ordering guarantee; sort by page number.
        pages.sort((a, b) => a.pageNumber - b.pageNumber);

        const imagePaths = pages.map((page) => page.imagePath);
        for (const imagePath of imagePaths) {
            console.log(`Found converted page: ${imagePath}`);
        }

        console.log(`Successfully converted ${imagePaths.length} pages to images`);
        return imagePaths;
    } catch (error) {
        console.error('PDF conversion error:', error);
        throw new Error(`PDF conversion failed: ${error.message}`);
    }
}

// Enhanced image preprocessing
/**
 * Produces a resized, normalized and sharpened PNG copy of an image for OCR.
 *
 * The copy is written to `<imagePath>_enhanced.png`. If any step fails the
 * error is logged and the original path is returned instead, so callers
 * always receive a usable path.
 *
 * @param {string} imagePath - Path of the source image.
 * @returns {Promise<string>} Path of the enhanced copy, or `imagePath` when
 *   enhancement failed.
 */
async function enhanceImageForOCR(imagePath) {
    try {
        // Separate output name so input and output are never the same file
        const outputPath = `${imagePath}_enhanced.png`;

        const resizeOptions = {
            withoutEnlargement: false,
            kernel: sharp.kernel.lanczos3
        };
        const sharpenOptions = { sigma: 1.2, flat: 1, jagged: 2 };
        const pngOptions = { quality: 95, compressionLevel: 6 };

        const pipeline = sharp(imagePath)
            .resize(null, 2000, resizeOptions)
            .normalize()
            .sharpen(sharpenOptions)
            .gamma(1.1)
            .png(pngOptions);

        await pipeline.toFile(outputPath);
        return outputPath;
    } catch (failure) {
        console.error('Image enhancement error:', failure);
        // Fall back to the untouched source image
        return imagePath;
    }
}

// Gemini OCR processing with intelligent formatting
/**
 * Sends one image to a Gemini model together with a mode-specific OCR prompt
 * and returns the text the model produced.
 *
 * Model selection: `structured` uses `gemini-2.5-pro`; every other mode uses
 * `gemini-2.5-flash`. This matches the non-structured prompt, which describes
 * itself as the fast "2.5 FLASH" variant, and the model names the /api/ocr
 * handler reports in its progress messages.
 *
 * The image is always submitted with MIME type `image/png`.
 *
 * @param {string} imagePath - Path of the image file to read and submit.
 * @param {string} apiKey - Google Generative AI API key.
 * @param {string} [mode='standard'] - `'structured'` or anything else
 *   (treated as standard).
 * @returns {Promise<string>} The text returned by the model.
 * @throws {Error} `OCR processing failed: <reason>` when reading the file or
 *   calling the model fails.
 */
async function processWithGemini(imagePath, apiKey, mode = 'standard') {
    try {
        const genAI = new GoogleGenerativeAI(apiKey);

        // Choose model based on mode (using 2.5 models as specified):
        // structured -> Pro, everything else -> Flash.
        const modelName = mode === 'structured' ? 'gemini-2.5-pro' : 'gemini-2.5-flash';
        const model = genAI.getGenerativeModel({ model: modelName });

        // Read and prepare image
        const imageBuffer = fs.readFileSync(imagePath);
        const imageBase64 = imageBuffer.toString('base64');

        const imagePart = {
            inlineData: {
                data: imageBase64,
                mimeType: 'image/png'
            }
        };

        // Choose prompt based on mode
        let prompt;
        if (mode === 'structured') {
            prompt = `🇹🇭 **THAI-FOCUSED MARKDOWN OCR - อ่านข้อความไทยและจัดรูปแบบเป็น Markdown**



**คำสั่งสำคัญ (CRITICAL INSTRUCTIONS):**

1. **อ่านทุกตัวอักษร** - แยกข้อความที่เขียนในรูปภาพทุกตัว

2. **ไม่ใช้ตัวอย่างทั่วไป** - ห้ามใช้ "Field | Value" ให้ใช้ข้อความจริงที่เห็น

3. **รักษาภาษาเดิม** - ถ้าเป็นไทยให้เป็นไทย ถ้าเป็นอังกฤษให้เป็นอังกฤษ

4. **ข้อความจริงเท่านั้น** - ห้ามตีความ แปล หรือสร้างคำอธิบายเอง



**วิธีการสแกน:**

- เริ่มจากซ้ายบน อ่านทีละบรรทัดไปขวาล่าง

- รวมข้อความทั้งหมด: หัวข้อ เนื้อหา ตัวเลข วันที่ ข้อความเล็ก

- **สำหรับข้อความไทย: เว้นวรรคระหว่างคำให้ถูกต้อง**

- สำหรับตาราง: ใช้หัวตารางและข้อมูลจริงที่เห็น

- สำหรับรายการ: ใช้รายการจริงที่เห็น



**กฎการจัดรูปแบบ MARKDOWN:**

- ## สำหรับหัวข้อใหญ่ (ใช้หัวข้อจริง)

- ### สำหรับหัวข้อย่อย (ใช้หัวข้อจริง)

- **ใช้ตารางเสมอสำหรับข้อมูลที่เป็นระบบ:**

  - เห็นตาราง → สร้างตารางด้วยหัวตารางและข้อมูลจริง

  - เห็นรายการ → แปลงเป็นตารางด้วยรายการจริง

  - เห็นคู่ข้อมูล → ใช้คีย์และค่าจริง

- ใช้ **ตัวหนา** สำหรับข้อความที่เน้นในรูป

- ใช้ > สำหรับหมายเหตุที่ปรากฏในรูป



**ตัวอย่างตาราง:**

| รายการ | จำนวน | ราคา |

|--------|--------|------|

| กาแฟ | 2 แก้ว | 60 บาท |

| ขนมปัง | 1 ชิ้น | 25 บาท |



**ผลลัพธ์: MARKDOWN ที่มีเนื้อหาจริง - ห้ามใช้แม่แบบทั่วไป**`;
        } else {
            prompt = `🇹🇭 **FAST MARKDOWN OCR - SAME FEATURES AS PRO MODE (2.5 FLASH)**



**คำสั่งสำคัญ (IDENTICAL TO STRUCTURED MODE):**

1. **อ่านทุกตัวอักษร** - แยกข้อความที่เขียนในรูปภาพทุกตัว

2. **ไม่ใช้ตัวอย่างทั่วไป** - ห้ามใช้ "Field | Value" ให้ใช้ข้อความจริงที่เห็น

3. **รักษาภาษาเดิม** - ถ้าเป็นไทยให้เป็นไทย ถ้าเป็นอังกฤษให้เป็นอังกฤษ

4. **ข้อความจริงเท่านั้น** - ห้ามตีความ แปล หรือสร้างคำอธิบายเอง



**วิธีการสแกน (SAME AS PRO MODE):**

- เริ่มจากซ้ายบน อ่านทีละบรรทัดไปขวาล่าง

- รวมข้อความทั้งหมด: หัวข้อ เนื้อหา ตัวเลข วันที่ ข้อความเล็ก

- **สำหรับข้อความไทย: เว้นวรรคระหว่างคำให้ถูกต้อง**

- สำหรับตาราง: ใช้หัวตารางและข้อมูลจริงที่เห็น

- สำหรับรายการ: ใช้รายการจริงที่เห็น



**MARKDOWN FORMATTING RULES (IDENTICAL TO PRO MODE):**

- ## สำหรับหัวข้อใหญ่ (ใช้หัวข้อจริง)

- ### สำหรับหัวข้อย่อย (ใช้หัวข้อจริง)

- **ใช้ตารางเสมอสำหรับข้อมูลที่เป็นระบบ:**

  - เห็นตาราง → สร้างตาราง markdown ด้วยหัวตารางและข้อมูลจริง

  - เห็นรายการ → แปลงเป็นตาราง markdown ด้วยรายการจริง

  - เห็นคู่ข้อมูล → ใช้คีย์และค่าจริงในตาราง markdown

- ใช้ **ตัวหนา** สำหรับข้อความที่เน้นในรูป

- ใช้ > สำหรับหมายเหตุที่ปรากฏในรูป

- ใช้ - สำหรับรายการเมื่อเหมาะสม



**การประมวลผลภาษา (SAME AS PRO MODE):**

- ข้อความไทย: เว้นวรรคให้ถูกต้อง แก้ไขข้อผิดพลาด OCR

- ข้อความอังกฤษ: รักษาการสะกดและตัวพิมพ์เดิม

- ตัวเลข: แยกตัวเลข ทศนิยม เปอร์เซ็นต์ รหัสให้แม่นยำ

- ภาษาผสม: รักษาภาษาเดิมของแต่ละส่วน



**ตัวอย่างตาราง MARKDOWN:**

| รายการ | จำนวน | ราคา |

|--------|--------|------|

| กาแฟ อเมริกาโน่ | 2 แก้ว | 120 บาท |

| ขนมปังโฮลวีท | 1 ชิ้น | 45 บาท |



**ผลลัพธ์: MARKDOWN WITH SAME FEATURES AS PRO MODE - JUST FASTER WITH 2.5 FLASH**`;
        }

        const result = await model.generateContent([prompt, imagePart]);
        const response = await result.response;

        // Both modes produce markdown, so the text is returned as-is with no
        // post-processing.
        return response.text();

    } catch (error) {
        console.error('Gemini processing error:', error);
        throw new Error(`OCR processing failed: ${error.message}`);
    }
}

// Generate different format outputs
/**
 * Packages extracted text as plain text, markdown and a JSON document.
 *
 * @param {string} text - The extracted text.
 * @param {string} fileName - Original file name; its last extension is
 *   stripped to build the markdown title.
 * @param {string} mode - Processing mode. In `'structured'` mode the text is
 *   used as the markdown unchanged; otherwise a `# <title>` heading is
 *   prepended.
 * @returns {{txt: string, md: string, json: {metadata: object, content: object}}}
 *   The three representations of the text.
 */
function generateFormats(text, fileName, mode) {
    const title = fileName.replace(/\.[^/.]+$/, '');
    const extractedAt = new Date().toISOString();

    // Derived views of the text, computed once up front
    const lines = text.split('\n');
    const words = text.split(/\s+/).filter((token) => token.length > 0);
    const paragraphs = text.split('\n\n').filter((block) => block.trim().length > 0);

    let markdown;
    if (mode === 'structured') {
        markdown = text;
    } else {
        markdown = `# ${title}\n\n${text}`;
    }

    const metadata = {
        fileName: fileName,
        extractedAt: extractedAt,
        characterCount: text.length,
        lineCount: lines.length,
        wordCount: words.length,
        processingMode: mode
    };

    const content = {
        rawText: text,
        lines: lines,
        paragraphs: paragraphs
    };

    return {
        txt: text,
        md: markdown,
        json: { metadata, content }
    };
}

// Progress tracking
// Both maps are keyed by the sessionId the /api/ocr handler generates for
// each request, and that handler removes a session's entries on a 30-second
// timer after it has responded.
// progressTracking holds the latest progress snapshot object per session.
const progressTracking = new Map();
const consoleLogs = new Map(); // Store console logs per session (array of { timestamp, message })

// Progress endpoint: returns the stored snapshot for one session, or a
// "Not started" placeholder when the session id is unknown
app.get('/api/progress/:sessionId', (req, res) => {
    const { sessionId } = req.params;
    const placeholder = { current: 0, total: 0, status: 'Not started' };

    res.json(progressTracking.get(sessionId) || placeholder);
});

// Latest progress endpoint: returns the snapshot of the session that was
// added to the map last (Map iteration follows insertion order)
app.get('/api/progress-latest', (req, res) => {
    const snapshots = [...progressTracking.values()];

    if (snapshots.length === 0) {
        res.json({ current: 0, total: 0, status: 'No active processing' });
        return;
    }

    // Last inserted session wins
    res.json(snapshots[snapshots.length - 1]);
});

// Main OCR endpoint
/**
 * POST /api/ocr — runs OCR on one uploaded image or PDF.
 *
 * Request (multipart form):
 *  - `file`   required; the upload handled by multer.
 *  - `mode`   optional; defaults to 'standard'.
 *  - `apiKey` optional; used only when no server-side GEMINI_API_KEY is set.
 *
 * Responses:
 *  - 200 `{ success: true, sessionId, data }` with the extracted text, the
 *    generated formats and basic text metadata.
 *  - 400 `{ success: false, error }` when no API key is available or no file
 *    was uploaded.
 *  - 500 `{ success: false, error }` when any processing step throws.
 *
 * Temp files: the upload, every PDF page image and every enhanced image are
 * tracked from the moment they exist and removed on every exit path
 * (success, failure and the early 400 returns).
 *
 * Progress: snapshots and the last 20 log lines are stored under `sessionId`
 * in `progressTracking` / `consoleLogs` and dropped 30 seconds after the
 * request finishes, whether it succeeded or failed.
 */
app.post('/api/ocr', upload.single('file'), async (req, res) => {
    const sessionId = Date.now().toString();

    // Declared outside the try block so the catch and finally paths can see
    // it. Holds every temp file this request is responsible for deleting.
    const tempFiles = new Set();
    if (req.file && req.file.path) {
        tempFiles.add(req.file.path);
    }

    // Deletes every tracked temp file, reporting each outcome through the
    // callbacks, and returns how many files were removed. The set is emptied
    // first, so a later call only handles files registered since.
    const removeTempFiles = (onRemoved, onFailed) => {
        const pending = [...tempFiles];
        tempFiles.clear();

        let removed = 0;
        for (const filePath of pending) {
            try {
                if (fs.existsSync(filePath)) {
                    fs.unlinkSync(filePath);
                    removed++;
                    onRemoved(filePath);
                }
            } catch (cleanupError) {
                onFailed(filePath, cleanupError);
            }
        }
        return removed;
    };

    try {
        const { mode = 'standard' } = req.body;

        // Use server-side API key if available, otherwise require from client
        const apiKey = GEMINI_API_KEY || req.body.apiKey;

        if (!apiKey) {
            // Reaching this point means no server-side key is configured
            // either, so the client has to supply one.
            return res.status(400).json({
                success: false,
                error: 'Google API Key is required'
            });
        }

        if (!req.file) {
            return res.status(400).json({
                success: false,
                error: 'No file uploaded'
            });
        }

        let extractedText = '';

        // Console log capture helper
        const addConsoleLog = (message) => {
            if (!consoleLogs.has(sessionId)) {
                consoleLogs.set(sessionId, []);
            }
            const logs = consoleLogs.get(sessionId);
            logs.push({
                timestamp: new Date().toISOString(),
                message: message
            });
            // Keep only last 20 logs to prevent memory issues
            if (logs.length > 20) {
                logs.shift();
            }
            console.log(message);
        };

        addConsoleLog(`🚀 Processing file: ${req.file.originalname} in ${mode} mode`);

        // Progress update helper
        const updateProgress = (current, total, status, details = {}) => {
            const progressData = {
                current,
                total,
                status,
                sessionId,
                fileName: req.file.originalname,
                consoleLogs: consoleLogs.get(sessionId) || [],
                ...details
            };
            progressTracking.set(sessionId, progressData);
            addConsoleLog(`Progress: ${current}/${total} - ${status}`);
        };

        // Handle PDF files
        if (req.file.mimetype === 'application/pdf') {
            addConsoleLog('🔄 Starting PDF processing...');
            addConsoleLog(`📄 PDF file: ${req.file.originalname}`);
            addConsoleLog(`📊 File size: ${(req.file.size / 1024).toFixed(2)} KB`);

            updateProgress(1, 10, '🖼️ Converting PDF to images...');
            const pdfImagePaths = await convertPdfToImages(req.file.path);

            // Track the page images immediately so a failure on any later
            // page still removes all of them.
            pdfImagePaths.forEach((pageImagePath) => tempFiles.add(pageImagePath));

            if (pdfImagePaths.length === 0) {
                throw new Error('No pages could be extracted from PDF. The PDF might be corrupted or empty.');
            }

            addConsoleLog(`✅ Converted ${pdfImagePaths.length} pages to images`);

            // Calculate total steps: 2 initial + (2 steps per page)
            const totalPages = pdfImagePaths.length;
            const totalSteps = 2 + (totalPages * 2);
            updateProgress(2, totalSteps, `📋 Found ${totalPages} pages to process`, {
                totalPages,
                currentPage: 0,
                totalCharacters: 0
            });

            // Process each page with OCR
            for (let i = 0; i < totalPages; i++) {
                const currentPage = i + 1;

                addConsoleLog(`🔍 Processing page ${currentPage}/${totalPages} (${Math.round((currentPage / totalPages) * 100)}%)`);

                // Update progress for enhancement step
                const enhanceStep = 2 + (i * 2) + 1;
                updateProgress(enhanceStep, totalSteps, `🔧 Enhancing page ${currentPage}/${totalPages}`, {
                    totalPages,
                    currentPage,
                    totalCharacters: extractedText.length,
                    phase: 'enhancing'
                });

                addConsoleLog(`📝 Enhancing image: page-${currentPage}.png`);
                const enhancedImagePath = await enhanceImageForOCR(pdfImagePaths[i]);
                // Track before OCR runs so a failed OCR call does not leak it.
                tempFiles.add(enhancedImagePath);

                // Update progress for OCR step
                const ocrStep = 2 + (i * 2) + 2;
                updateProgress(ocrStep, totalSteps, `🤖 Running OCR on page ${currentPage}/${totalPages}`, {
                    totalPages,
                    currentPage,
                    totalCharacters: extractedText.length,
                    phase: 'ocr'
                });

                addConsoleLog(`🤖 Running OCR on page ${currentPage} with Gemini ${mode === 'structured' ? '2.5-Pro' : '2.5-Flash'}`);
                const pageText = await processWithGemini(enhancedImagePath, apiKey, mode);

                addConsoleLog(`✅ Page ${currentPage} processed - ${pageText.length} characters extracted`);

                if (mode === 'structured') {
                    extractedText += `\n\n## Page ${currentPage}\n\n${pageText}`;
                } else {
                    extractedText += `\n\n--- Page ${currentPage} ---\n\n${pageText}`;
                }

                // Update progress with completed page info
                updateProgress(ocrStep, totalSteps, `✅ Page ${currentPage}/${totalPages} completed - ${pageText.length} chars`, {
                    totalPages,
                    currentPage,
                    totalCharacters: extractedText.length,
                    pageCharacters: pageText.length,
                    phase: 'completed'
                });

                addConsoleLog(`📊 Total extracted so far: ${extractedText.length} characters`);
            }

        } else {
            // Handle regular image files
            addConsoleLog('🖼️  Processing single image file...');
            addConsoleLog(`📄 File: ${req.file.originalname}`);
            addConsoleLog(`📊 Size: ${(req.file.size / 1024).toFixed(2)} KB`);

            updateProgress(1, 3, '🔧 Enhancing image for better OCR...', {
                totalPages: 1,
                currentPage: 1,
                totalCharacters: 0,
                phase: 'enhancing'
            });
            const enhancedImagePath = await enhanceImageForOCR(req.file.path);
            // Track before OCR runs so a failed OCR call does not leak it.
            tempFiles.add(enhancedImagePath);

            updateProgress(2, 3, `🤖 Running OCR with Gemini ${mode === 'structured' ? '2.5-Pro' : '2.5-Flash'}...`, {
                totalPages: 1,
                currentPage: 1,
                totalCharacters: 0,
                phase: 'ocr'
            });
            extractedText = await processWithGemini(enhancedImagePath, apiKey, mode);

            updateProgress(3, 3, `✅ OCR completed - ${extractedText.length} characters extracted`, {
                totalPages: 1,
                currentPage: 1,
                totalCharacters: extractedText.length,
                pageCharacters: extractedText.length,
                phase: 'completed'
            });
            addConsoleLog(`✅ OCR completed - ${extractedText.length} characters extracted`);
        }

        // Generate all formats
        const formats = generateFormats(extractedText, req.file.originalname, mode);

        // Comprehensive cleanup of all temporary files (upload, page images,
        // enhanced images). Done before responding so the outcome is part of
        // the session log.
        addConsoleLog('🧹 Cleaning up temporary files...');
        const cleanedFiles = removeTempFiles(
            (filePath) => addConsoleLog(`🗑️  Cleaned up: ${path.basename(filePath)}`),
            (filePath, cleanupError) => addConsoleLog(`⚠️  Cleanup warning for ${path.basename(filePath)}: ${cleanupError.message}`)
        );
        addConsoleLog(`✅ Cleanup complete: ${cleanedFiles} files removed`);

        // Final progress update
        const finalProgress = progressTracking.get(sessionId);
        if (finalProgress) {
            updateProgress(finalProgress.total, finalProgress.total, '✅ Processing complete!');
        }

        // Return success response
        res.json({
            success: true,
            sessionId: sessionId,
            data: {
                fileName: req.file.originalname,
                fileSize: req.file.size,
                processingMode: mode,
                extractedText: extractedText,
                formats: formats,
                metadata: {
                    characterCount: extractedText.length,
                    wordCount: extractedText.split(/\s+/).filter(word => word.length > 0).length,
                    lineCount: extractedText.split('\n').length,
                    processedAt: new Date().toISOString()
                }
            }
        });

    } catch (error) {
        console.error('OCR processing error:', error);

        // Comprehensive cleanup on error
        const cleanedFiles = removeTempFiles(
            (filePath) => console.log(`🗑️  Error cleanup: ${path.basename(filePath)}`),
            (filePath, cleanupError) => console.warn(`⚠️  Error cleanup warning for ${path.basename(filePath)}:`, cleanupError.message)
        );

        if (cleanedFiles > 0) {
            console.log(`✅ Error cleanup complete: ${cleanedFiles} files removed`);
        }

        // Only answer if nothing has been sent for this request yet.
        if (!res.headersSent) {
            res.status(500).json({
                success: false,
                error: error.message || 'Internal server error during OCR processing'
            });
        }
    } finally {
        // The early 400 returns above skip both cleanup calls; remove what
        // is still tracked (a no-op after the success and error paths).
        removeTempFiles(
            (filePath) => console.log(`🗑️  Cleaned up: ${path.basename(filePath)}`),
            (filePath, cleanupError) => console.warn(`⚠️  Cleanup warning for ${path.basename(filePath)}:`, cleanupError.message)
        );

        // Clean up progress tracking and console logs after a delay, on
        // every exit path, so failed sessions do not accumulate in memory.
        setTimeout(() => {
            progressTracking.delete(sessionId);
            consoleLogs.delete(sessionId);
        }, 30000); // Clean up after 30 seconds
    }
});

// Root endpoint: static service description plus the current time
app.get('/', (req, res) => {
    const endpoints = {
        health: '/api/health',
        ocr: '/api/ocr (POST)'
    };

    const serviceInfo = {
        success: true,
        message: '🌙 Luna OCR Backend API',
        version: '1.0.0',
        endpoints,
        status: 'Running',
        port: PORT,
        timestamp: new Date().toISOString()
    };

    res.json(serviceInfo);
});

// Health check endpoint: also tells the client whether it must supply its
// own API key (true when no server-side key is configured)
app.get('/api/health', (req, res) => {
    const serverKeyConfigured = Boolean(GEMINI_API_KEY);

    res.json({
        success: true,
        message: 'Luna OCR Backend is running!',
        hasApiKey: serverKeyConfigured,
        requiresUserApiKey: !serverKeyConfigured,
        timestamp: new Date().toISOString()
    });
});

// Manual cleanup endpoint: triggers the same temp-file sweep the timer runs
app.post('/api/cleanup', (req, res) => {
    const reportFailure = (failure) => {
        console.error('Manual cleanup error:', failure);
        res.status(500).json({
            success: false,
            error: `Cleanup failed: ${failure.message}`
        });
    };

    try {
        console.log('🧹 Manual cleanup requested...');
        cleanupTempFiles();

        const completedAt = new Date().toISOString();
        res.json({
            success: true,
            message: 'Cleanup completed successfully',
            timestamp: completedAt
        });
    } catch (failure) {
        reportFailure(failure);
    }
});

// Error handling middleware
/**
 * Final Express error handler.
 *
 * Upload problems caused by the client are answered with 400 and a
 * descriptive message instead of a generic 500:
 *  - any multer.MulterError (size limit, unexpected field, ...);
 *  - the "Invalid file type" error raised by the upload file filter.
 * Everything else is logged and answered with a 500 that does not expose
 * internal details.
 *
 * @param {Error} error - The error passed down the middleware chain.
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Next middleware; used when a response has already
 *   started, so Express's default handler can close the connection.
 */
app.use((error, req, res, next) => {
    // A response that has already started cannot be replaced.
    if (res.headersSent) {
        return next(error);
    }

    if (error instanceof multer.MulterError) {
        const message = error.code === 'LIMIT_FILE_SIZE'
            ? 'File too large. Maximum size is 10MB.'
            : `Upload error: ${error.message}`;

        return res.status(400).json({
            success: false,
            error: message
        });
    }

    // Rejections from the upload file filter are plain Errors recognised by
    // their message; they describe a client mistake, not a server fault.
    if (error && typeof error.message === 'string' && error.message.startsWith('Invalid file type')) {
        return res.status(400).json({
            success: false,
            error: error.message
        });
    }

    console.error('Unhandled error:', error);
    res.status(500).json({
        success: false,
        error: 'Internal server error'
    });
});

// Graceful shutdown cleanup
/**
 * Shared handler for termination signals: logs which signal arrived, runs
 * the temp-file sweep, logs the farewell line and exits with status 0.
 *
 * Both signals now print the same, correctly encoded farewell message.
 *
 * @param {string} signal - Name of the signal being handled, used in the log.
 * @returns {void}
 */
function shutdownGracefully(signal) {
    console.log(`\n🛑 Received ${signal}, cleaning up before shutdown...`);
    cleanupTempFiles();
    console.log('👋 Luna OCR Backend shutting down gracefully');
    process.exit(0);
}

// The signal name is passed explicitly rather than taken from the listener
// arguments, so the log line is right however the event is raised.
process.on('SIGINT', () => shutdownGracefully('SIGINT'));
process.on('SIGTERM', () => shutdownGracefully('SIGTERM'));

// Start server and print the startup banner once it is listening
app.listen(PORT, () => {
    const baseUrl = `http://localhost:${PORT}`;
    const bannerLines = [
        `🚀 Luna OCR Backend running on port ${PORT}`,
        `📡 Health check: ${baseUrl}/api/health`,
        `🔍 OCR endpoint: ${baseUrl}/api/ocr`,
        `🧹 Cleanup endpoint: ${baseUrl}/api/cleanup`,
        `⏰ Automatic cleanup runs every 15 minutes`
    ];

    bannerLines.forEach((line) => console.log(line));
});

// Exported so other modules can load the configured app
module.exports = app;