Powered By Blogger

Search Here!

Tuesday, October 8, 2024

Get PDF Text Content (Line-Based)

 

const fs = require("fs");
const path = require("path");

const pdfParse = require("pdf-parse");
/**
 * Reads the text content from a specified or latest downloaded PDF file.
 *
 * The file is looked up inside `CONSTANTS.FILE.FILE_PATH`. The extracted text
 * is split on newlines, every line is trimmed, and the full list of lines is
 * printed with `console.table` before the result is returned.
 *
 * NOTE(review): when `fileName` is null this function calls
 * `this.getLatestDownloadedFileName()`, so it must be invoked as a method of
 * (or bound to) an object that provides that helper — confirm against callers.
 *
 * @param {string|null} [fileName=null] - The name of the PDF file. If null,
 *     the latest downloaded file will be used.
 * @param {number|null} [lineToCheck=null] - The line number in the PDF
 *     content to retrieve (1-based index). If null, all lines will be
 *     returned.
 * @returns {Promise<string|string[]>} - The trimmed text of the requested
 *     line, an empty string when `lineToCheck` is not a whole number within
 *     1..lineCount, or every trimmed line when `lineToCheck` is null.
 * @throws {Error} Propagates any error raised while reading the file or
 *     parsing the PDF.
 */
async function getPdfTextContentLineBased(fileName = null,
                lineToCheck = null) {
    // Directory in which the PDF files are expected to live
    const downloadDir = CONSTANTS.FILE.FILE_PATH;

    let targetFileName = fileName;

    if (targetFileName !== null) {
        console.log(
            `Reading content from the specified file: ${targetFileName}`
        );
    } else {
        // No explicit name given: fall back to the most recent download
        targetFileName = await this.getLatestDownloadedFileName();
        console.log(
            `Reading content from the latest downloaded file: ${targetFileName}`
        );
    }

    const pdfFilePath = path.join(downloadDir, targetFileName);

    // Non-blocking read; the function is already async
    const dataBuffer = await fs.promises.readFile(pdfFilePath);

    // Parse the PDF and extract its text
    const data = await pdfParse(dataBuffer);

    // Split into lines and trim once, so the table and the return value agree
    const lines = data.text.split("\n").map((line) => line.trim());

    // Print all lines of the PDF text content
    console.log(`Content of the file ${fileName || "latest downloaded PDF"}:`);
    console.table(
        lines.map((content, index) => ({ Line: index + 1, Content: content }))
    );

    // No specific line requested: return everything
    if (lineToCheck === null) {
        return lines;
    }

    // Number() keeps numeric strings working; the integer check stops a
    // fractional index from reaching the array lookup (undefined -> crash).
    const lineNumber = Number(lineToCheck);
    const isValidLine =
        Number.isInteger(lineNumber) &&
        lineNumber >= 1 &&
        lineNumber <= lines.length;

    if (!isValidLine) {
        console.warn(`Line number ${lineToCheck} is out of range.`);
        return "";
    }

    return lines[lineNumber - 1];
}

No comments:

Post a Comment