The loadFinished signal only indicates that the initial page load has finished; more DOM elements can still be created afterwards. That is the case for the element with id "DataTables_Table_0", which is created moments after the page is loaded.
A possible solution is to inject a script that keeps checking whether the element exists and, once it does, notifies the Python side so that the HTML can be retrieved.
import sys
from functools import cached_property
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel
from pprint import pprint
import bs4 as bs
def get_webchannel_source():
    """Return the text of Qt's bundled ``qwebchannel.js`` resource.

    The script is read from the ``:/qtwebchannel/qwebchannel.js`` resource
    path. An empty string is returned when the resource cannot be opened
    for reading.
    """
    resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    opened = resource.open(QtCore.QIODevice.ReadOnly)
    if opened:
        raw = resource.readAll()
        resource.close()
        # readAll() yields a QByteArray; .data() exposes it as bytes.
        return raw.data().decode()
    return ""
class Manager(QtCore.QObject):
    """Load a page in a QWebEngineView and capture its HTML on request.

    The instance registers itself on a QWebChannel under the name
    ``"manager"``, so a script injected with :meth:`set_script` can call
    :meth:`save_html` from JavaScript once the DOM is in the desired state.
    The captured markup is then available through :attr:`html`.
    """

    def __init__(self, *, offline=True, visible=False, parent=None):
        """Create the view, its page and the web channel.

        Args:
            offline: When true, use a freshly constructed
                ``QWebEngineProfile`` instead of the shared default profile.
            visible: When false, the view is flagged ``WA_DontShowOnScreen``
                before being shown, so no window appears.
            parent: Optional parent ``QObject``.
        """
        super().__init__(parent)
        self._html = ""
        self._is_finished = False
        # Widgets require a QApplication; touching the cached property makes
        # sure one exists before the view is built below.
        _ = self.app
        self._profile = (
            QtWebEngineWidgets.QWebEngineProfile()
            if offline
            else QtWebEngineWidgets.QWebEngineProfile.defaultProfile()
        )
        self.view.resize(640, 480)
        if not visible:
            self.view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        self.view.show()
        self.webchannel.registerObject("manager", self)
        self.view.page().setWebChannel(self.webchannel)

    @cached_property
    def app(self):
        """Return the application object, creating it only if none exists.

        Only one QApplication may exist per process, so an already running
        instance is reused instead of constructing a second one (which would
        fail when several managers are created or when this class is used
        inside an existing Qt program).
        """
        existing = QtWidgets.QApplication.instance()
        if existing is not None:
            return existing
        return QtWidgets.QApplication(sys.argv)

    @property
    def profile(self):
        """Return the web engine profile selected in ``__init__``."""
        return self._profile

    @cached_property
    def view(self):
        """Return the web view, created on first access with a page bound to :attr:`profile`."""
        view = QtWebEngineWidgets.QWebEngineView()
        page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
        view.setPage(page)
        return view

    @cached_property
    def webchannel(self):
        """Return the QWebChannel (created on first access) owned by this object."""
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        """Return the HTML captured by the last run, or ``""`` if none was captured."""
        return self._html

    def set_script(self, script):
        """Register ``script`` for injection into pages of :attr:`profile`.

        The source of ``qwebchannel.js`` is prepended so that ``QWebChannel``
        is defined when ``script`` runs. The script is injected at
        ``DocumentReady`` into the main world.
        """
        qscript = QtWebEngineWidgets.QWebEngineScript()
        qscript.setName("qscript")
        qscript.setSourceCode(get_webchannel_source() + "\n" + script)
        qscript.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
        qscript.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
        self.profile.scripts().insert(qscript)

    def start(self, url):
        """Load ``url`` and run the event loop until the HTML has been captured.

        The call returns when :meth:`html_callable` quits the application.
        """
        # Reset per-run state so the manager can be reused for another URL;
        # otherwise save_html() would be ignored and stale HTML returned.
        self._html = ""
        self._is_finished = False
        self.view.load(QtCore.QUrl.fromUserInput(url))
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        """Request the page's HTML; intended to be called from JavaScript.

        Only the first call of a run has an effect; later calls are ignored.
        """
        if self._is_finished:
            return
        self._is_finished = True
        # toHtml() is asynchronous: the markup is delivered to html_callable.
        self.view.page().toHtml(self.html_callable)

    def html_callable(self, html):
        """Store the delivered ``html`` and quit the event loop started by :meth:`start`."""
        self._html = html
        self.app.quit()
# JavaScript source passed to Manager.set_script() in main().
# wait_qt() retries every 100 ms until the global `qt` object is defined, then
# opens a QWebChannel on qt.webChannelTransport and stores the exported
# "manager" object. find_element() then retries every 100 ms until the element
# with id 'DataTables_Table_0' exists, at which point it calls
# manager.save_html().
JS = """
var manager = null;
function find_element() {
var e = document.getElementById('DataTables_Table_0');
console.log("try verify", e, manager);
if (e != null && manager != null) {
console.log(e)
manager.save_html()
} else {
setTimeout(find_element, 100);
}
}
(function wait_qt() {
if (typeof qt != 'undefined') {
console.log("Qt loaded");
new QWebChannel(qt.webChannelTransport, function (channel) {
manager = channel.objects.manager;
find_element();
});
} else {
setTimeout(wait_qt, 100);
}
})();
"""
def main():
    """Fetch the Fix Central page and pretty-print its results table.

    The page is loaded through a Manager with the JS polling script
    installed; the captured HTML is parsed with BeautifulSoup and the
    ``<table>`` whose id is ``DataTables_Table_0`` is printed.
    """
    url = "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"

    scraper = Manager()
    scraper.set_script(JS)
    scraper.start(url)

    document = bs.BeautifulSoup(scraper.html, "html.parser")
    table = document.find("table", attrs={"id": "DataTables_Table_0"})
    pprint(table)


# Run the scraper only when executed as a script.
if __name__ == "__main__":
    main()
self.html. When you do this, you will see that the DataTables_Table_0 element is missing in the output. @antont There is no problem in loading the HTML, as far as I can see. With 'body', {'id': 'ibm-com'}, you will see that you get successful results. (I chose this myself after printing the self.html.) Even if you get the HTML with urllib, the result does not change. So I don't think the problem is in the code.