Given a DOM tree with nested tags, I would like to clean up the DOM object by removing redundant duplicate wrappers. However, this should only apply when a tag has exactly one child tag and that child is of the same type. For example,
Fix <div><div>1</div></div> and not <div><div>1</div><div>2</div></div>.
I'm trying to figure out how I could do this using PHP's DOM extension. Below is the starting code and I'm looking for help figuring out the logic needed.
<?php
// Collect libxml parse errors internally instead of emitting PHP warnings.
libxml_use_internal_errors(true);

// Sample markup: three nested <div> wrappers around a single paragraph.
$html = '<div><div><div><p>Some text here</p></div></div></div>';

// Configure the document before loading so the options apply to the parse.
$dom = new DOMDocument();
$dom->formatOutput = true;
$dom->preserveWhiteSpace = false;
$dom->loadHTML($html);
/**
 * Recursively collapse redundant same-tag wrappers below (and including) $node.
 *
 * An element is collapsed with its child when that child is its only
 * significant child node (whitespace-only text is ignored) and has the same
 * tag name, e.g. <div><div>1</div></div> becomes <div>1</div>, while
 * <div><div>1</div><div>2</div></div> is left untouched.
 *
 * The outer element is kept and the inner element's children are hoisted into
 * it. To avoid losing information, nothing is collapsed when BOTH elements
 * carry attributes; when only the inner one does, its attributes are copied
 * onto the outer element.
 *
 * @param DOMNode $node Document or node whose subtree is cleaned in place.
 *
 * @return void
 */
function dom_remove_duplicate_nodes($node)
{
    if (!$node->hasChildNodes()) {
        return;
    }

    if ($node->nodeType === XML_ELEMENT_NODE) {
        // Loop because one collapse can expose another level of the same
        // wrapper (<div><div><div>x</div></div></div>).
        while (($inner = dom_find_duplicate_child($node)) !== null) {
            dom_absorb_child($node, $inner);
        }
    }

    // childNodes is a live list; iterate over a snapshot so that edits made
    // while recursing cannot shift or skip entries.
    foreach (iterator_to_array($node->childNodes, false) as $child) {
        dom_remove_duplicate_nodes($child);
    }
}

/**
 * Return the sole significant child of $element if it is a same-tag element
 * that can be merged without losing attributes, or null otherwise.
 *
 * @param DOMNode $element Element to inspect.
 *
 * @return DOMNode|null
 */
function dom_find_duplicate_child($element)
{
    $candidate = null;

    foreach ($element->childNodes as $child) {
        // Whitespace between tags does not count as content.
        if ($child->nodeType === XML_TEXT_NODE && trim((string) $child->nodeValue) === '') {
            continue;
        }

        // A second significant child, or any non-element content (text,
        // comment, CDATA, ...), means the wrapper is not redundant.
        if ($candidate !== null || $child->nodeType !== XML_ELEMENT_NODE) {
            return null;
        }

        $candidate = $child;
    }

    if ($candidate === null || $candidate->nodeName !== $element->nodeName) {
        return null;
    }

    // Merging two attributed elements would have to drop or override
    // attributes, so leave that case alone.
    if ($element->hasAttributes() && $candidate->hasAttributes()) {
        return null;
    }

    return $candidate;
}

/**
 * Merge $child into its parent $element: copy its attributes, move its
 * children into the position it occupied, then remove the emptied $child.
 *
 * @param DOMNode $element Parent element that is kept.
 * @param DOMNode $child   Direct child of $element that is unwrapped.
 *
 * @return void
 */
function dom_absorb_child($element, $child)
{
    if ($child->hasAttributes()) {
        foreach ($child->attributes as $attribute) {
            $element->setAttribute($attribute->nodeName, $attribute->nodeValue);
        }
    }

    // insertBefore() moves an already-attached node, so firstChild advances
    // on every iteration and document order is preserved.
    while ($child->firstChild !== null) {
        $element->insertBefore($child->firstChild, $child);
    }

    $element->removeChild($child);
}
// Start the traversal at the document node so the whole tree is visited.
dom_remove_duplicate_nodes($dom);
I collected some helper functions that might make it easier to work with DOM nodes, much as you would in JavaScript.
/**
 * Empty $node of all its descendants and detach it from its parent.
 *
 * @param DOMNode $node Node to delete.
 *
 * @return DOMNode The removed node. A node that has no parent (already
 *                 detached, or the document itself) is emptied and returned
 *                 as-is instead of triggering a call on null.
 */
function DOM_delete_node($node)
{
    DOM_delete_children($node);

    $parent = $node->parentNode;
    if ($parent === null) {
        // Nothing to detach from; removeChild() on null would be a fatal error.
        return $node;
    }

    return $parent->removeChild($node);
}
/**
 * Remove every child of $node, emptying each child's own subtree first.
 *
 * @param DOMNode $node Node whose children are removed in place.
 *
 * @return void
 */
function DOM_delete_children($node)
{
    for (;;) {
        // `??` mirrors isset(): a missing or null firstChild ends the loop.
        $first = $node->firstChild ?? null;
        if ($first === null) {
            return;
        }

        // Depth-first: clear the child's descendants before detaching it.
        DOM_delete_children($first);
        $node->removeChild($first);
    }
}
/**
 * Serialize the children of $node to HTML (an "innerHTML"-style dump).
 *
 * @param DOMNode $node Node whose child nodes are serialized; its
 *                      ownerDocument is used to do the serialization.
 *
 * @return string Concatenated HTML of every child, in document order.
 */
function DOM_dump_child_nodes($node)
{
    $document = $node->ownerDocument;
    $fragments = [];

    foreach ($node->childNodes as $child) {
        $fragments[] = $document->saveHTML($child);
    }

    return implode('', $fragments);
}
/**
 * Serialize $node itself to HTML (an "outerHTML"-style dump).
 *
 * @param DOMNode $node Node to serialize.
 *
 * @return string|null The node's HTML, or null when the node has no
 *                     ownerDocument to serialize it with.
 */
function DOM_dump_node($node)
{
    $document = $node->ownerDocument;
    if (!$document) {
        return null;
    }

    return $document->saveHTML($node);
}