How can I efficiently parse HTML in Python?

Question

I want to parse the HTML code efficiently without an external library.

I have already tried a for loop that checks each character in turn:

# Asker's attempt: walk the markup one character at a time.
# NOTE(review): the name `list` shadows the built-in type of the same name.
list = []
html = """<html><p>Hello</p></html>"""
# Flag set to 1 after "<" and back to 0 after ">"; it is never read.
m = 0
for a in html:
    if a == "<":
        m = 1
        list.append([])
    elif a == ">":
        m = 0
        list.append([])
    else:
        # NOTE(review): this replaces the last entry with the single character
        # `a` instead of appending to it, so earlier characters are lost.
        list[-1] = a
print(list)

But the code was very slow on 50KB files.

What exactly do you want to achieve? Please add some more description, like what output do you expect? Do you get any errors? Which one? Please include the complete stack trace. You are appending empty lists to a list? And why can't you use external libraries?
– Ocaso Protal, Commented May 23, 2019 at 14:00
I'm not sure what you want to do, but you can parse HTML with the standard library module html.parser.
– Maximouse, Commented May 23, 2019 at 14:06

Noctis Skytower · Accepted Answer · 2019-05-23 21:34:51Z

May I recommend starting with a simple HTML parser like the one shown below? It uses only the standard library that comes with Python and has no external dependencies. It gives you a basic DOM API that should be a good starting point to work from. The code works for the simple case it is meant to tackle; depending on your end goal, you may need to alter and extend it with further functionality.

#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom


def main():
    """Parse a sample document and print the results of several DOM queries."""
    # noinspection PyPep8
    document = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
'''
    # Build the DOM tree from the markup.
    builder = DocumentParser()
    builder.feed(document)
    builder.close()
    root = builder.document.documentElement
    root.normalize()
    print(root.toprettyxml())

    # Inspect the first <title> element.
    title = root.getElementsByTagName('title')[0]
    print(title.toxml())
    print(title.tagName)
    print(title.firstChild.data)
    print(title.parentNode.tagName)

    # Inspect the first <p> element.
    paragraph = root.getElementsByTagName('p')[0]
    print(paragraph.toxml())
    print(paragraph.getAttribute('class'))

    # Inspect the <a> elements, then look one up by its id attribute.
    anchors = root.getElementsByTagName('a')
    print(anchors[0].toxml())
    anchor_markup = [anchor.toxml() for anchor in anchors]
    pprint.pprint(anchor_markup)
    matched_markup = [match.toxml() for match in find(root, id='link3')]
    pprint.pprint(matched_markup)
    for anchor in anchors:
        print(anchor.getAttribute('href'))

    # Print every text fragment in the tree, one per line.
    print('\n'.join(get_text(root)))


class DocumentParser(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from HTML fed to the parser.

    ``document`` is the minidom document being built. ``focus`` is the node
    that receives the next child: it moves down when an element is opened
    and back up when an element is closed.

    Text that is only whitespace is dropped, other text is stripped of
    surrounding whitespace, and text seen while no element is open is
    ignored.
    """

    # HTML elements that never have content or an end tag; they must not
    # become the focus, or the nodes that follow would be nested inside them.
    VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        """Create the parser together with an empty document to populate."""
        super().__init__(convert_charrefs=convert_charrefs)
        self.document = self.focus = xml.dom.minidom.DOMImplementation() \
            .createDocument(None, None, None)

    @property
    def document_has_focus(self):
        """Return True when no element is currently open."""
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        """Append a new element to the focus and, unless void, descend into it."""
        element = self.document.createElement(tag)
        for name, value in attrs:
            # Valueless attributes (e.g. ``<input disabled>``) arrive as None;
            # store them as the empty string so attribute values stay strings.
            element.setAttribute(name, '' if value is None else value)
        self.focus.appendChild(element)
        if tag not in self.VOID_ELEMENTS:
            self.focus = element

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Open elements nested inside it are closed implicitly. An end tag
        that matches no open element is ignored instead of walking past the
        document node, which has no ``tagName``.
        """
        if tag in self.VOID_ELEMENTS:
            # Void elements are never left open (covers ``<br/>`` too).
            return
        node = self.focus
        while node is not self.document and node.tagName != tag:
            node = node.parentNode
        if node is not self.document:
            self.focus = node.parentNode

    def handle_data(self, data):
        """Append stripped, non-blank text to the element that has focus."""
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        """Raise ``RuntimeError`` with the given message."""
        raise RuntimeError(message)

    def close(self):
        """Finish parsing and return the focus to the document node."""
        super().close()
        while not self.document_has_focus:
            self.focus = self.focus.parentNode


def find(element, **kwargs):
    """Yield ``element`` and its descendants whose attributes match ``kwargs``.

    A node matches when it has a ``getAttribute`` method and, for every
    keyword given, ``getAttribute(keyword)`` equals the supplied value.
    Nodes are visited parent first, then children in order.
    """
    lookup = getattr(element, 'getAttribute', None)
    if lookup:
        for attribute, expected in kwargs.items():
            if not lookup(attribute) == expected:
                break
        else:
            # Every requested attribute matched (or none were requested).
            yield element
    for descendant in element.childNodes:
        yield from find(descendant, **kwargs)


def get_nodes_by_type(node, node_type):
    """Yield ``node`` and its descendants whose ``nodeType`` is ``node_type``.

    Nodes are produced parent first, then children in order.
    """
    if node.nodeType == node_type:
        yield node
    # Each entry is an iterator over the children of one open ancestor.
    trail = [iter(node.childNodes)]
    while trail:
        for child in trail[-1]:
            if child.nodeType == node_type:
                yield child
            # Descend into this child before resuming its siblings.
            trail.append(iter(child.childNodes))
            break
        else:
            # This level is exhausted; resume the parent's remaining children.
            trail.pop()


def get_text(node):
    """Return a generator over the ``data`` of each text node under ``node``."""
    text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
    return (text_node.data for text_node in text_nodes)


# Run the demonstration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Collectives™ on Stack Overflow

How can I efficiently parse HTML in Python?

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related