Source: routes/v1/scan-image.js

/** @module routes/v1 */

// Register router

const fs = require('fs');
const { spawn } = require('child_process');
const Jimp = require('jimp');
const TokenBucket = require('tokenbucket');
const { GoogleGenAI, Type } = require('@google/genai');

const express = require('express');
const router = express.Router();

const ScanController = require('../../controllers/ScanController.js');
const UserController = require('../../controllers/UserController.js');
const BookController = require('../../controllers/BookController.js');

const { UserFactory } = require('../../classes/users/User.js');

const { authorize, generateTempImagePath } = require('../../utils.js');
const { BookFactory } = require('../../classes/books/Book.js');

// Rate limiter for Gemini requests (4000 RPM): configured with a size of 4000 and
// 4000 tokens added per interval of 60000 (presumably milliseconds, i.e. one minute —
// TODO confirm against the tokenbucket package).
const geminiTokenBucket = new TokenBucket({
    size: 4000,
    interval: 60000,
    tokensToAddPerInterval: 4000,
});

// Google GenAI client, authenticated with the key read from the
// GOOGLE_GENAI_API_KEY environment variable.
const ai = new GoogleGenAI({
    apiKey: process.env.GOOGLE_GENAI_API_KEY,
});

/**
 * Route for running AI inference using the DAHLS model & scan all books.
 *
 * Creates a scan record, stores the uploaded image, segments it into spines and
 * kicks off the (background) identification of every spine. Responds with the
 * scan id, the number of spines found and an estimated completion time in ms.
 */
router.post('/scan/upload', async (req, res) => {
    if (!await authorize(['inference.spine'], req, res)) {
        return;
    }

    if (!req.busboy) {
        res.status(400).send({ success: false, message: 'No file provided' });
        return;
    }

    const sController = new ScanController();
    let scan = null;
    let imagePath = null;
    let scanStarted = false;

    try {
        const latitude = req.body?.lat;
        const longitude = req.body?.lng;

        // Explicit presence check: a coordinate of 0 is a valid location.
        const isPresent = value => value !== undefined && value !== null && value !== '';

        if (isPresent(latitude) && isPresent(longitude)) {
            scan = await sController.insertWithLocation(req.user.id, latitude, longitude);
        } else {
            scan = await sController.insert(req.user.id);
        }

        if (!scan) {
            res.status(500).send({ success: false, message: 'Scan not created' });
            return;
        }

        imagePath = `${process.env.STORAGE_PATH}/scans/${scan.id}.jpg`;

        const fileReceived = await saveUploadedImage(req, imagePath);
        if (!fileReceived) {
            await discardScan(sController, scan.id, imagePath);
            res.status(400).send({ success: false, message: 'No file provided' });
            return;
        }

        await sController.changeImagePath(scan.id, imagePath);

        const segmentData = await segmentImage(imagePath);
        if (!segmentData) {
            await discardScan(sController, scan.id, imagePath);
            res.status(500).send({ success: false, message: 'Error segmenting image' });
            return;
        }
        if (segmentData.length === 0) {
            // Nothing to process: do not leave an orphaned scan row and image behind.
            await discardScan(sController, scan.id, imagePath);
            res.status(404).send({ success: false, message: 'No spines found' });
            return;
        }

        const amountOfSpines = segmentData.length;
        await sController.changeSpinesSegmentedBy(scan.id, amountOfSpines);

        // Intentionally not awaited: identification continues after the response is sent.
        scanStarted = true;
        startScan(scan.id, req.user.id, imagePath, segmentData).catch(error => {
            console.error('Error running scan:', error);
        });

        const estimatedCompletionTime = (6 + 0.2 * amountOfSpines) * 1000; // 6 seconds for inference + 0.2 seconds per spine

        res.status(200).send({
            success: true,
            scan_id: scan.id,
            books: amountOfSpines,
            estimated_completion: estimatedCompletionTime
        });
    } catch (error) {
        console.error('Error handling scan upload:', error);

        // Only clean up when no background job is using the scan and its image.
        if (scan && !scanStarted) {
            await discardScan(sController, scan.id, imagePath);
        }
        if (!res.headersSent) {
            res.status(500).send({ success: false, message: 'Error processing scan' });
        }
    }
});

/**
 * Stream the first file of a multipart request to disk.
 *
 * @param {object} req Express request with a `busboy` parser attached
 * @param {string} imagePath Destination path of the uploaded file
 * @returns {Promise<boolean>} true once the file is fully written, false when the
 *                             request finished without containing any file part
 */
function saveUploadedImage(req, imagePath) {
    return new Promise((resolve, reject) => {
        let output = null;

        req.busboy.on('file', (fieldname, file) => {
            if (output) {
                // Only the first file is stored; drain any further ones so parsing can finish.
                file.resume();
                return;
            }

            output = fs.createWriteStream(imagePath);
            output.on('error', reject);
            output.on('close', () => resolve(true));
            file.on('error', error => {
                output.destroy();
                reject(error);
            });
            file.pipe(output);
        });

        // Without a file part the write stream never exists, so settle here instead of hanging.
        const onParsingDone = () => {
            if (!output) {
                resolve(false);
            }
        };
        req.busboy.on('finish', onParsingDone);
        req.busboy.on('close', onParsingDone);
        req.busboy.on('error', reject);
        req.on('error', reject);

        req.pipe(req.busboy);
    });
}

/**
 * Best-effort removal of a scan record and its stored image.
 * Failures are logged and never thrown, so this is safe to call from error paths.
 *
 * @param {object} sController ScanController instance
 * @param {number} scanId
 * @param {string|null} imagePath
 * @returns {Promise<void>}
 */
async function discardScan(sController, scanId, imagePath) {
    try {
        await sController.delete(scanId);
    } catch (error) {
        console.error('Error deleting scan:', error);
    }

    if (!imagePath) {
        return;
    }

    try {
        await fs.promises.unlink(imagePath);
    } catch (error) {
        // A missing file simply means nothing was written yet.
        if (error.code !== 'ENOENT') {
            console.error('Error deleting scan image:', error);
        }
    }
}

/** Route for fetching a scan's status together with the books identified in it */
router.get('/scan/:scan_id', async (req, res) => {
    const isAuthorized = await authorize(['inference.spine'], req, res);
    if (!isAuthorized) {
        return;
    }

    // Shared shape of every error response of this route.
    const fail = (status, message) => {
        res.status(status).send({ success: false, message });
    };

    const { scan_id: scanId } = req.params;
    if (!scanId) {
        fail(400, 'Missing scan ID');
        return;
    }

    const scans = new ScanController();
    const scan = await scans.byId(scanId);
    if (!scan) {
        fail(400, 'Invalid scan ID');
        return;
    }
    // Scans are private to the user that created them.
    if (scan.user_id !== req.user.id) {
        fail(403, 'Insufficient permissions');
        return;
    }

    const results = await scans.getResultsAndData(scanId);

    // The segment is stored as a JSON string of [x, y] pairs.
    const toPoint = (point) => ({
        x: point[0],
        y: point[1],
    });
    const toAuthor = (author) => ({
        id: author.id,
        name: author.name,
        personal_name: author.personal_name,
    });
    const toBook = (book) => ({
        id: book.id,
        title: book.title,
        subtitle: book.subtitle,
        segment_area: JSON.parse(book.segment).map(toPoint),
        authors: book.authors.map(toAuthor),
    });

    res.status(200).send({
        success: true,
        scan_id: scan.id,
        status: scan.status,
        created_at: scan.created_at,
        location: {
            lat: scan.latitude,
            lng: scan.longitude,
        },
        amount_of_books_identified: scan.amount_of_books_identified,
        amount_of_spines_segmented: scan.amount_of_spines_segmented,
        books: results.map(toBook),
    });
});

/** Route for deleting a scan that belongs to the requesting user */
router.post('/scan/delete', async (req, res) => {
    const isAuthorized = await authorize(['inference.spine'], req, res);
    if (!isAuthorized) {
        return;
    }

    // Shared shape of every error response of this route.
    const fail = (status, message) => {
        res.status(status).send({ message });
    };

    const { scan_id: scanId } = req.body;
    if (!scanId) {
        fail(400, 'Missing scan_id');
        return;
    }

    const scans = new ScanController();
    const scan = await scans.byId(scanId);
    if (!scan) {
        fail(400, 'Invalid scan_id');
        return;
    }

    // Only the owner of a scan may delete it.
    const isOwner = scan.user_id === req.user.id;
    if (!isOwner) {
        fail(403, 'Insufficient permissions');
        return;
    }

    await scans.delete(scanId);
    res.send({ message: 'Scan deleted' });
});

// Export the router; startScan and segmentImage, declared further down, are called by the route handlers above.
module.exports = router;

// Functions


/**
 * Run the spine identification for a scan and, once it has finished,
 * advance the user's 'Scanner' goal track by one scanned bookshelf.
 *
 * @param {number} scanId
 * @param {number} userId
 * @param {string} imagePath
 * @param {Array<object>} segmentData Segments forwarded to scanSpines
 * @return {Promise<void>}
 */
async function startScan(scanId, userId, imagePath, segmentData) {
    await scanSpines(scanId, segmentData, imagePath);

    const users = new UserController();
    const factory = new UserFactory();

    const userRecord = await users.byId(userId);
    const scanningUser = await factory.load(userRecord).create();

    await scanningUser.changeGoalProgressByTrackName('Scanner', 'bookshelves scanned', 1);
}

/**
 * Run the DAHLS model on the image.
 *
 * The image is sent base64-encoded to the Roboflow endpoint, authenticated with
 * the ROBOFLOW_API_KEY environment variable.
 *
 * @param {string} imagePath Path of the image to segment
 * @returns {Promise<Array<object>|null>} The `predictions` array of the response, or
 *          null when the image cannot be read, the request fails, or the response
 *          does not contain a predictions array
 */
async function segmentImage(imagePath) {
    try {
        const image = await fs.promises.readFile(imagePath, {
            encoding: 'base64',
        });

        const endpoint = new URL('https://serverless.roboflow.com/dahl-s-book-spine-detection/4');
        endpoint.searchParams.set('api_key', process.env.ROBOFLOW_API_KEY);

        const response = await fetch(endpoint, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
            },
            body: image,
        });

        if (!response.ok) {
            console.error(`Error segmenting image: request failed with status ${response.status}`);
            return null;
        }

        const data = await response.json();

        // Callers read `.length`, so anything that is not an array counts as a failure.
        return Array.isArray(data?.predictions) ? data.predictions : null;
    } catch (error) {
        console.error('Error segmenting image:', error);
        return null;
    }
}

/**
 * Identify the book on every segmented spine of a scan and store the matches.
 *
 * The scan is marked 'processing'; each spine polygon is then cut out of the image
 * and sent to Gemini, the returned title/author is looked up in the book database
 * and the best match is stored as a scan result. Spines are prepared one after
 * another (respecting the Gemini rate limiter) while the identifications themselves
 * run concurrently. A failure on one spine is logged and does not affect the others.
 * Once every spine has been handled the scan is marked 'completed'.
 *
 * @param {number} scanId
 * @param {Array<{points: Array<{x: number, y: number}>}>} segmentData Segments with their polygon points
 * @param {string} imagePath Path of the full scan image
 * @returns {Promise<void>}
 */
async function scanSpines(scanId, segmentData, imagePath) {
    const originalImage = await Jimp.read(imagePath);

    const bController = new BookController();
    const sController = new ScanController();

    await sController.updateStatus(scanId, 'processing');

    // "<title>-<author>" -> ranked search results, shared by all spines of this scan.
    const searchCache = new Map();

    // Identical for every spine, so built once.
    const config = {
        responseMimeType: 'application/json',
        responseSchema: {
            type: Type.OBJECT,
            required: ["title", "author"],
            properties: {
                title: {
                    type: Type.STRING,
                },
                author: {
                    type: Type.STRING,
                },
            },
        },
        systemInstruction: [
            {
                text: `If no author name is present, infer the author from your knowledge. If nothing is present, output "null".`,
            }
        ],
    };

    /** Ask Gemini which book a cropped spine shows and store the best database match. */
    const identifySpine = async (points, spineImageB64) => {
        let data;
        try {
            data = await ai.models.generateContent({
                model: 'gemini-2.0-flash-lite',
                config,
                contents: [
                    {
                        role: 'user',
                        parts: [
                            {
                                inlineData: {
                                    mimeType: "image/jpeg",
                                    data: spineImageB64,
                                },
                            },
                            {
                                type: 'text',
                                text: "What is the book shown in the image?",
                            },
                        ],
                    }
                ],
            });
        } catch (error) {
            console.error('Error:', error);
            return;
        }

        try {
            const result = JSON.parse(data.candidates[0].content.parts[0].text);

            // The model is instructed to answer with the literal string "null" when unsure.
            const title = result.title !== 'null' ? result.title : null;
            const author = result.author !== 'null' ? result.author : null;

            if (!title) {
                return;
            }

            const candidates = await findBookCandidates(bController, searchCache, title, author);
            if (candidates.length === 0) {
                return;
            }

            const bestMatch = candidates[0];
            await sController.insertResult(scanId, bestMatch.id, points.map(p => [Math.round(p.x), Math.round(p.y)]));
            await sController.changeBooksIdentified(scanId, 1);
        } catch (e) {
            console.error('Error parsing response:', e);
        }
    };

    const pending = [];

    for (const segment of segmentData) {
        const points = segment?.points;
        if (!Array.isArray(points) || points.length === 0) {
            // Without a polygon there is nothing to crop.
            continue;
        }

        try {
            const spineImageB64 = await cropSpineToBase64(originalImage, points);

            await geminiTokenBucket.removeTokens(1);

            // identifySpine never rejects, so the collected promises are safe to await together.
            pending.push(identifySpine(points, spineImageB64));
        } catch (error) {
            console.error('Error preparing spine:', error);
        }
    }

    await Promise.all(pending);

    await sController.updateStatus(scanId, 'completed');
}

/**
 * Cut a polygon out of an image: everything outside the polygon is masked away and
 * the result is cropped to the polygon's bounding box.
 *
 * @param {Jimp} image Source image (left untouched, a clone is edited)
 * @param {Array<{x: number, y: number}>} points Polygon outline, non-empty
 * @returns {Promise<string>} Base64 encoded JPEG (without the data URI prefix)
 */
async function cropSpineToBase64(image, points) {
    const xs = points.map(p => p.x);
    const ys = points.map(p => p.y);

    const minX = Math.min(...xs);
    const maxX = Math.max(...xs);
    const minY = Math.min(...ys);
    const maxY = Math.max(...ys);

    const width = maxX - minX + 1;
    const height = maxY - minY + 1;

    const spine = image.clone();
    const mask = new Jimp(spine.bitmap.width, spine.bitmap.height, 0x000000FF);
    const polygon = points.map(p => [p.x, p.y]);

    // Pixels outside the bounding box can never be inside the polygon, so only the
    // box (clamped to the image) is tested; the rest of the mask stays black.
    const left = Math.max(0, Math.floor(minX));
    const top = Math.max(0, Math.floor(minY));
    const right = Math.min(mask.bitmap.width - 1, Math.ceil(maxX));
    const bottom = Math.min(mask.bitmap.height - 1, Math.ceil(maxY));

    if (right >= left && bottom >= top) {
        mask.scan(left, top, right - left + 1, bottom - top + 1, function (x, y, idx) {
            if (pointInPolygon(x, y, polygon)) {
                this.bitmap.data[idx + 0] = 255; // R
                this.bitmap.data[idx + 1] = 255; // G
                this.bitmap.data[idx + 2] = 255; // B
                this.bitmap.data[idx + 3] = 255; // A
            }
        });
    }

    spine.mask(mask, 0, 0);

    const cropped = spine.crop(minX, minY, width, height);
    const dataUri = await cropped.getBase64Async(Jimp.MIME_JPEG);

    return dataUri.replace('data:image/jpeg;base64,', '');
}

/**
 * Look up the books matching a recognised title (and author, when known), ordered
 * by title similarity. Results are cached per title/author combination.
 *
 * @param {object} bController BookController instance
 * @param {Map<string, Array<object>>} searchCache Cache shared between lookups
 * @param {string} title
 * @param {string|null} author
 * @returns {Promise<Array<object>>} Matching books, best match first
 */
async function findBookCandidates(bController, searchCache, title, author) {
    const cacheKey = `${title.toLowerCase()}-${(author || 'null').toLowerCase()}`;
    if (searchCache.has(cacheKey)) {
        return searchCache.get(cacheKey);
    }

    const rankBySimilarity = books => {
        books.forEach(book => { book.similarity = stringSimilarity(book.title, title); });
        books.sort((a, b) => b.similarity - a.similarity);
        return books;
    };

    let results;
    if (author) {
        results = rankBySimilarity(await bController.searchByTitleAndAuthorName(title, author));
    } else {
        // A title-only search is less precise, so weak matches are dropped.
        results = rankBySimilarity(await bController.searchByTitleNatural(title, 100))
            .filter(book => book.similarity > 0.5);
    }

    searchCache.set(cacheKey, results);
    return results;
}

/**
 * Test whether a point lies inside a polygon (ray casting / even-odd rule).
 *
 * A horizontal ray is cast from the point towards +x; the point is inside when
 * the ray crosses the polygon outline an odd number of times.
 *
 * @param {number} x
 * @param {number} y
 * @param {Array<Array<number>>} polygon Vertices as [x, y] pairs
 * @returns {boolean} true when the point is inside the polygon (false for an empty polygon)
 */
function pointInPolygon(x, y, polygon) {
    let inside = false;
    for (let i = 0, j = polygon.length - 1; i < polygon.length; j = i++) {
        const xi = polygon[i][0], yi = polygon[i][1];
        const xj = polygon[j][0], yj = polygon[j][1];

        // Only edges whose endpoints lie on opposite sides of the ray can cross it.
        // This also guarantees yi !== yj, so the division below is always safe.
        if ((yi > y) === (yj > y)) {
            continue;
        }

        const crossingX = xi + ((xj - xi) * (y - yi)) / (yj - yi);
        if (x < crossingX) {
            inside = !inside;
        }
    }
    return inside;
}

/**
 * Score how alike two strings are by comparing their overlapping substrings
 * (n-grams): twice the number of shared n-grams divided by the total number of
 * n-grams in both strings.
 * @param {string} str1 First string to match
 * @param {string} str2 Second string to match
 * @param {number} [substringLength=2] Optional. Length of the substrings that are compared. Default 2.
 * @param {boolean} [caseSensitive=false] Optional. Whether case is taken into account. Default false.
 * @returns Number between 0 and 1, with 0 being a low match score. Strings shorter than substringLength score 0.
 */
const stringSimilarity = (str1, str2, substringLength = 2, caseSensitive = false) => {
    const left = caseSensitive ? str1 : str1.toLowerCase();
    const right = caseSensitive ? str2 : str2.toLowerCase();

    if (left.length < substringLength || right.length < substringLength) {
        return 0;
    }

    const leftGramCount = left.length - substringLength + 1;
    const rightGramCount = right.length - substringLength + 1;

    // How many times each n-gram of the first string is still available for matching.
    const available = new Map();
    for (let start = 0; start < leftGramCount; start += 1) {
        const gram = left.slice(start, start + substringLength);
        available.set(gram, (available.get(gram) ?? 0) + 1);
    }

    // Every n-gram of the second string consumes one matching n-gram of the first.
    let shared = 0;
    for (let start = 0; start < rightGramCount; start += 1) {
        const gram = right.slice(start, start + substringLength);
        const remaining = available.get(gram) ?? 0;
        if (remaining > 0) {
            available.set(gram, remaining - 1);
            shared += 1;
        }
    }

    return (shared * 2) / (leftGramCount + rightGramCount);
};