Convert pdf to html files using node.js and pdf.js

Question

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

pdf.js converts PDFs to images (canvas, PNG, etc.). It won't convert PDF to HTML. — shaochuancs
– shaochuancs, Commented Jun 10, 2019 at 15:05

Jan · Accepted Answer · 2023-11-17 08:40:26Z

Maybe I found something similar - I am working with local PDF file and browser. I made small changes in ready made viewer.js / PDF.js, it should be possible to process using both Node.js & browser.

This script embeds the PDF specified by the command-line argument into the webpack-built viewer.js and starts the browser.

// Embeds the PDF given as the first CLI argument into viewer.js as a data URI,
// adds "file://" to the HOSTED_VIEWER_ORIGINS list, then opens viewer.html
// through the shell.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
// datauri is loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer.js');
    let wp = fs.readFileSync(viewerJSpath, 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcStart = wp.indexOf(pdfName);
    const originsDecl = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    // Closing bracket of the first array literal after the identifier.
    const originsEnd = originsDecl < 0 ? -1 : wp.indexOf(']', originsDecl);
    // Without this guard a missing marker (indexOf === -1) silently corrupts
    // viewer.js. NOTE: viewer.js is overwritten in place, so the default PDF
    // name is gone after the first run and a pristine copy must be restored
    // before running again.
    if (srcStart < 0 || originsEnd < 0) {
        throw new Error(`Cannot patch ${viewerJSpath}: "${pdfName}" or HOSTED_VIEWER_ORIGINS not found`);
    }
    // Apply the two replacements back to front so the earlier offset stays
    // valid regardless of which marker comes first in the file.
    const edits = [
        { start: srcStart, end: srcStart + pdfName.length, text: content },
        { start: originsEnd, end: originsEnd, text: ', "file://"' },
    ].sort((a, b) => b.start - a.start);
    for (const edit of edits) {
        wp = wp.slice(0, edit.start) + edit.text + wp.slice(edit.end);
    }
    fs.writeFileSync(viewerJSpath, wp, 'utf-8');
    const c = path.join(__dirname, 'viewer.html');
    // Quoted so that a directory name containing spaces is not split by the shell.
    chp.execSync(`"${c}"`);
});

Then I tried adding the original width as an additional style property in renderTextLayer's appendText method, and sorting the elements by position in TextLayerBuilder's render method, right after this.textLayerDiv.appendChild(textLayerFrag);.

~~All mentioned PDF.js changes on my Github~~ it seems only the web and build folders are required (apart from, for example, npm i -g datauri).

Readability improvements ? #12512 PR

Console script / bookmarklet

Using puppeteer and a slightly modified PDF.js it is possible to convert directly (works both headed and headless, but element sizes differ slightly).

// Embeds the PDF given as the first CLI argument into a copy of viewerSrc.js
// (written out as viewer.js), opens viewer.html in a puppeteer-controlled
// browser and saves whatever the page passes to window.reader() to PDFtexts.txt.
const fs = require('fs');
const path = require('path');
const { pathToFileURL } = require('url');
const pdf = require('process').argv[2];
// Both packages are loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    let wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcStart = wp.indexOf(pdfName);
    const originsDecl = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    // Closing bracket of the first array literal after the identifier.
    const originsEnd = originsDecl < 0 ? -1 : wp.indexOf(']', originsDecl);
    // Without this guard a missing marker (indexOf === -1) silently produces
    // a corrupted viewer.js.
    if (srcStart < 0 || originsEnd < 0) {
        throw new Error(`Cannot patch ${viewerJSpath}Src.js: "${pdfName}" or HOSTED_VIEWER_ORIGINS not found`);
    }
    // Apply the two replacements back to front so the earlier offset stays
    // valid regardless of which marker comes first in the file.
    const edits = [
        { start: srcStart, end: srcStart + pdfName.length, text: content },
        { start: originsEnd, end: originsEnd, text: ', "file://"' },
    ].sort((a, b) => b.start - a.start);
    for (const edit of edits) {
        wp = wp.slice(0, edit.start) + edit.text + wp.slice(edit.end);
    }
    fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        try {
            const [page] = await browser.pages();
            // Expose the callback BEFORE navigating and wait for it to be
            // installed; otherwise the page can finish rendering and call
            // window.reader() while it is still undefined.
            await page.exposeFunction('reader', (elLists) => {
                // May be invoked more than once by the page; every call
                // overwrites the file with the latest data.
                fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
                // Close shortly afterwards rather than immediately, as the original did.
                setTimeout(() => {
                    browser.close().catch((closeErr) => console.error(closeErr));
                }, 100);
            });
            // pathToFileURL escapes spaces, '#', '%' etc. that a hand-built
            // 'file:///' + path string would pass through unescaped.
            await page.goto(pathToFileURL(path.join(__dirname, 'viewer.html')).href);
        } catch (runErr) {
            // Do not leave a browser process behind when setup or navigation fails.
            await browser.close();
            throw runErr;
        }
    })().catch((runErr) => {
        console.error(runErr);
        process.exitCode = 1;
    });
});

Fixes required for puppeteer/chromium:

const message = exception?.message; // => exception.message
page: this.pageLabel ?? this.id // => this.pageLabel || this.id

viewer.js => viewerSrc.js basic additions:

function webViewerPageRendered({
...
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    }); // generate all remaining pages
  }
}

class BaseViewer {
  constructor(options) {
    this.pageNo = []; // rendered pages array
...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
    if (this.pageNo.indexOf(val) < 0) {
      this.pageNo.push(val);
    }
    if (this.pagesCount - 1 <= this.pageNo.length) {
      window.reader(elLists); // sent result back 2 node.js
    }

The result looks like {PageNo:{ElNo:{data}, ...}, ...} and can easily be translated to a web page or processed further.

{
    "1": {
        "0": {
            "x": 99.9871,
            "y": 98.0496,
            "w": 557.695,
            "h": 22,
            "text": "Trace-based Just-in-Time Type Specialization for Dynamic",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 99.9871px; top: 98.0496px; width: 557.695px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.970163);"
        },
        "1": {
            "x": 327.478,
            "y": 122.793,
            "w": 102.707,
            "h": 22,
            "text": "Languages",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 327.478px; top: 122.793px; width: 102.707px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.932262);"
        },
...
    "2": {
        "0": {
            "x": 393.677,
            "y": 90.3408,
            "w": 192.909,
            "h": 11,
            "text": "1 for (var i = 2; i < 100; ++i) {",
            "ff": "monospace",
            "fs": "11.1347px",
            "cssText": "left: 393.677px; top: 90.3408px; width: 192.909px; font-size: 11.1347px; font-family: monospace; transform: scaleX(0.875232);"
        },
        "1": {
            "x": 67.0588,
            "y": 91.7599,
            "w": 173.346,
            "h": 11,
            "text": "Hence, recording and compiling a trace",
            "ff": "sans-serif",
            "fs": "11.1347px",
            "cssText": "left: 67.0588px; top: 91.7599px; width: 173.346px; font-size: 11.1347px; font-family: sans-serif; transform: scaleX(0.895175);"
        },

Summary of changes (in original gh-pages branch):

- changes in PDF.js

  function appendText(task, geom, styles) {
...
    let left, top;
+++              , width;
...
    if (angle === 0) {
      left = tx[4];
      top = tx[5] - fontAscent;
    } else {
      left = tx[4] + fontAscent * Math.sin(angle);
      top = tx[5] - fontAscent * Math.cos(angle);
    }
+++ width = geom.width * task._viewport.transform[0];

    textDiv.style.left = `${left}px`;
    textDiv.style.top = `${top}px`;
+++ textDiv.style.width = `${width}px`;

- new nodeView.js

// nodeView.js: embeds the PDF given as the first CLI argument into a copy of
// viewerSrc.js (written out as viewer.js) and then launches viewer.ff through
// the shell.
const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
// datauri is loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const viewerJSpath = path.join(__dirname, './viewer');
// datauri() is asynchronous: called without a callback it returns a Promise,
// which the original concatenated into the file as "[object Promise]".
// Use the callback form, as pdf2sortedMergedTexts.js does.
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    // Read the template from viewerSrc.js like pdf2sortedMergedTexts.js; the
    // original read the extension-less './viewer' path while writing
    // './viewer.js'. NOTE(review): confirm viewerSrc.js is the intended source.
    let wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcStart = wp.indexOf(pdfName);
    const originsDecl = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    // Closing bracket of the first array literal after the identifier.
    const originsEnd = originsDecl < 0 ? -1 : wp.indexOf(']', originsDecl);
    // Without this guard a missing marker (indexOf === -1) silently produces
    // a corrupted viewer.js.
    if (srcStart < 0 || originsEnd < 0) {
        throw new Error(`Cannot patch ${viewerJSpath}Src.js: "${pdfName}" or HOSTED_VIEWER_ORIGINS not found`);
    }
    // Apply the two replacements back to front so the earlier offset stays
    // valid regardless of which marker comes first in the file.
    const edits = [
        { start: srcStart, end: srcStart + pdfName.length, text: content },
        { start: originsEnd, end: originsEnd, text: ', "file://"' },
    ].sort((a, b) => b.start - a.start);
    for (const edit of edits) {
        wp = wp.slice(0, edit.start) + edit.text + wp.slice(edit.end);
    }
    fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
    // presumably '.ff' is a file type associated with the browser on the
    // author's machine (see openFF.bat) — confirm.
    const c = path.join(__dirname, 'viewer.ff');
    // Quoted so that a directory name containing spaces is not split by the shell.
    chp.execSync(`"${c}"`);
});

- new openFF.bat: start node nodeView.js %1

- new pdf2sortedMergedTexts.js

// pdf2sortedMergedTexts.js: embeds the PDF given as the first CLI argument
// into a copy of viewerSrc.js (written out as viewer.js), opens viewer.html in
// a puppeteer-controlled browser and saves whatever the page passes to
// window.reader() to PDFtexts.txt.
const fs = require('fs');
const path = require('path');
const { pathToFileURL } = require('url');
const pdf = require('process').argv[2];
// Both packages are loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    let wp = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const srcStart = wp.indexOf(pdfName);
    const originsDecl = wp.indexOf('HOSTED_VIEWER_ORIGINS');
    // Closing bracket of the first array literal after the identifier.
    const originsEnd = originsDecl < 0 ? -1 : wp.indexOf(']', originsDecl);
    // Without this guard a missing marker (indexOf === -1) silently produces
    // a corrupted viewer.js.
    if (srcStart < 0 || originsEnd < 0) {
        throw new Error(`Cannot patch ${viewerJSpath}Src.js: "${pdfName}" or HOSTED_VIEWER_ORIGINS not found`);
    }
    // Apply the two replacements back to front so the earlier offset stays
    // valid regardless of which marker comes first in the file.
    const edits = [
        { start: srcStart, end: srcStart + pdfName.length, text: content },
        { start: originsEnd, end: originsEnd, text: ', "file://"' },
    ].sort((a, b) => b.start - a.start);
    for (const edit of edits) {
        wp = wp.slice(0, edit.start) + edit.text + wp.slice(edit.end);
    }
    fs.writeFileSync(viewerJSpath + '.js', wp, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        try {
            const [page] = await browser.pages();
            // Expose the callback BEFORE navigating and wait for it to be
            // installed; otherwise the page can finish rendering and call
            // window.reader() while it is still undefined.
            await page.exposeFunction('reader', (elLists) => {
                // May be invoked more than once by the page; every call
                // overwrites the file with the latest data.
                fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
                // Close shortly afterwards rather than immediately, as the original did.
                setTimeout(() => {
                    browser.close().catch((closeErr) => console.error(closeErr));
                }, 100);
            });
            // pathToFileURL escapes spaces, '#', '%' etc. that a hand-built
            // 'file:///' + path string would pass through unescaped.
            await page.goto(pathToFileURL(path.join(__dirname, 'viewer.html')).href);
        } catch (runErr) {
            // Do not leave a browser process behind when setup or navigation fails.
            await browser.close();
            throw runErr;
        }
    })().catch((runErr) => {
        console.error(runErr);
        process.exitCode = 1;
    });
});

- changed viewer.js -> viewerSrc.js

function webViewerPageRendered({
... +++
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    });
  }
}
...
class BaseViewer {
  constructor(options) {
+++ this.pageNo = [];

...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
+++ if (this.pageNo.indexOf(val) < 0) {
+++   this.pageNo.push(val);
+++   console.log(this.pageNo);
+++ }
+++ if (this.pagesCount - 1 <= this.pageNo.length) {
+++   window.reader(elLists);
+++ }

    this._currentPageNumber = val;

  render(timeout = 0) {
...
    this.textLayerRenderTask.promise.then(() => {
      this.textLayerDiv.appendChild(textLayerFrag);
+++   this.reorder(this.textLayerDiv);

... new
  reorder(_src) {
    const src = _src.children;
    let els = [];
    const elDest = [];
    for (let j = 0; j < src.length; j++) {
        const i = src[j];
        if (i.className === 'endOfContent') continue;
        els.push({ x: parseFloat(i.style.left), y: parseFloat(i.style.top), w: parseFloat(i.style.width), h: i.offsetHeight, text: i.innerText, ff: i.style.fontFamily, fs: i.style.fontSize, cssText: i.style.cssText });
    }
    els.sort((a, b) => {
      if (Math.abs(a.y - b.y) <= 1) {
          if (Math.abs(a.x - b.x) <= 1) return 0;
          else return a.x - b.x;
      } else return a.y - b.y;
    });
    let elMin = els[0];
    for (let i = 1; i < els.length; i++) {
         if (elMin.x + elMin.w + 1 >= els[i].x &&
          Math.abs(elMin.y - els[i].y) < 1 &&
          elMin.h === els[i].h &&
          elMin.ff === els[i].ff &&
          elMin.fs === els[i].fs) {
            elMin.text += els[i].text;
            elMin.w = els[i].x + els[i].w - elMin.x;
            if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
            continue;
        }
        if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
        elMin = els[i];
    }
    if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
    els = _src;
    while (els.lastChild) els.removeChild(els.lastChild);
    const elList = [];
    if (window.elLists === undefined) window.elLists = {};
    const uqIdx = { x: [], y: [] };
    for (let i = 0; i < elDest.length; i++) {
        const o = document.createElement('DIV');
        o.innerHTML = elDest[i].text;
        o.setAttribute('style', elDest[i].cssText + 'width:' + elDest[i].w + 'px;position:absolute;');
        els.appendChild(o);
        elList.push([elDest[i].x, elDest[i].x + elDest[i].w, o, elDest[i].y, elDest[i].y + elDest[i].h, elDest[i].text]);
        if (uqIdx.x.indexOf(elDest[i].x) < 0) uqIdx.x.push(elDest[i].x);
        if (uqIdx.y.indexOf(elDest[i].y) < 0) uqIdx.y.push(elDest[i].y);
    }
    elLists[_src.parentElement.getAttribute("data-page-Number")] = Object.assign({}, elDest);
  }

- changed viewer.css

+++ input{padding:0px;border:1px solid #e0e0e0;}input:focus{background-color:red;}

Online demo here - PDF download changed to HTM download (text layer) eltomjan.github.io/JStoolsSPAdemos/pdf2htm/web/viewer.html
Perfect work!! So how can I convert a PDF to HTML using your solution? Can I do it with some command in CMD, or should I create an HTML file with some JS command?
Not sure yet ;-), checked my github, there is simple Node.js script and I used it once on my blind friend's machine with some batch file as default PDF viewer for best screen reader experience. It is using puppeteer for this automation (without a bit too secure common browser). Mozilla had classic argues "The problem with something like that is most likely that in many documents it wouldn't work that well, also keeping in mind that PDF documents can use left-to-right, right-to-left, or top-to-bottom reading orders, and there's also the question of performance/maintainability of the code."
... Generally speaking, something like issue github.com/mozilla/pdf.js/issues/6269 is probably the most reasonable way forward here.
Fresh new github.com/eltomjan/ETEhomeTools/blob/master/HTM_HTA/… based on these scripts converting responsive HTML/tables to XL.

Collectives™ on Stack Overflow

Convert pdf to html files using node.js and pdf.js

1 Answer 1

5 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

5 Comments

Your Answer

Sign up or log in

Post as a guest

Related