I've been using puppeteer to try and get pdfs - or its buffer response - from a website which does two requests after clicking on the link for the document (which open in a new tab):
- The first request (http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF) retrieves the session guid to access the document
- The second request (http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c) uses that guid to access the document and render the pdf on the browser.
The result of my attempts has been a blank pdf being generated, even if it was created after the page been loaded (checked with Fiddler).
I've tried
- Intercepting
targetcreatedevent to get the page - Get the second request url and use
page.gototo get the pdf - Wait on a the page response to get the buffer
- Set
Page.setDownloadBehaviourto allow download instead of rendering it in the browser
Any guidance and help is appreciated. The code tried is below:
const puppeteer = require("puppeteer");
let browser;
async function getDocument(index, title, page) {
if (index != 19) return "";
console.log("getDocument START");
console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);
let docPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ViewDocument.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ViewDocument page");
}
})
);
/* Tried to set the download behaviour to download automatically the pdf but it didn't work */
// await page._client.send("Page.setDownloadBehaviour", {
// behaviour: "allow",
// downloadPath: "./"
// });
await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
let pdfResults = "";
let pdfPage = await docPagePromise;
/* If I get the target from the page returned from the promise I get the correct ur, however the page url is blank */
// let target = await pdfPage.target();
// let url = await target.url();
// let response = await pdfPage.goto(url);
// console.log(response);
pdfPage.on("console.log", msg => console.log(msg));
/* This is never called */
await pdfPage.on("response", async response => {
console.log("PDF PAGE Response");
let responseBuffer = await response.buffer();
let responseHeaders = response.headers();
console.log("PDF PAGE Response Header: " + responseHeaders);
console.log("PDF PAGE Response Buffer: " + responseBuffer);
return {
responseHeaders,
responseBuffer
};
});
console.log(pdfResults);
let pdfTitle = await pdfPage.title();
console.log("PDFPage URL: " + pdfPage.url());
console.log("PDFPage Title: " + pdfTitle);
let pdfTarget = await pdfPage.target();
console.log("PDFTarget URL: " + (await pdfTarget.url()));
console.log("PDFTarget Type: " + pdfTarget.type());
pdfPage = await pdfTarget.page();
console.log("PDFPage URL: " + pdfPage.url());
await pdfPage.waitFor(3000);
let pdf = await pdfPage.pdf({ path: title + ".pdf" });
console.log(pdf);
return pdf;
}
async function getAdditionalDocumentation(page) {
console.log("getAdditionalDocumentation START");
await page.waitForSelector("#repGroupSummary__ctl1_lnkGroupName");
await page.click("#repGroupSummary__ctl1_lnkGroupName");
await page.waitForSelector("#pnlDocumentList > table > tbody > tr");
await page.waitFor(2000);
const documents = await page.$$eval(
"#pnlDocumentList > table > tbody > tr",
docs =>
docs.map((doc, i) => ({
type: doc.querySelector(".tdl-subgroup > span").innerText,
datePublished: doc.querySelector(
".tdl-date > span[id*='DatePublished']"
).innerText,
dateReceived: doc.querySelector(".tdl-date > span[id*='DateReceived']")
.innerText,
docType: doc.querySelector(".tdl-doctype > span").innerText,
description: doc.querySelector(".tdl-description > span").innerText
// 'docBuffer': window.getDocument(i + 1, doc.querySelector('.tdl-description > span').innerText)
}))
);
for (let i = 0; i < documents.length; i++) {
documents[i].docBuffer = await getDocument(i + 1, documents[i].description, page);
}
await page.click("#btnSummary");
console.log("getAdditionalDocumentation FINISH");
return documents;
}
async function getDocuments(page, browser) {
console.log("getDocuments");
let newPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ShowCaseFile.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ShowCaseFile page");
}
})
);
await page.click("#tab_externalDocuments > span");
await page.waitForSelector("#hp-doc-link");
await page.click("#hp-doc-link");
const newPage = await newPagePromise;
const additionalDocumentation = await getAdditionalDocumentation(newPage);
return {
additionalDocumentation
};
}
async function run() {
try {
browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => console.log("PAGE LOG:", ...msg.args));
const planningReference = "LA04/2017/1388/F";
await page.goto(
"http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
);
await page.waitForSelector("#simpleSearchString");
await page.type("#simpleSearchString", planningReference);
await page.click("#simpleSearchForm > div.row3 > input.button.primary");
await page.waitForSelector("#simpleDetailsTable");
console.log("getDocuments START");
const documents = await getDocuments(page, browser);
console.log("getDocuments FINISH");
console.log(documents);
console.log(documents.additionalDocumentation.length);
} finally {
browser.close();
}
}
run();