1

Where keys are represented by element type and values are represented by #foo and .bar (spaced and ready for explode()). Is it possible, or does something exist for it?

I know that this question might incite some wrath, and I'm hoping nobody links to that post about parsing HTML, but I'm hoping it's not impossible. Thanks for the help.

Addendum: Ideally, PHP would be used, since it's the only scripting language I know.

1
  • 2
    What programming language are you talking about? Commented Apr 9, 2010 at 14:54

1 Answer 1

1

Thanks for all the help :\ This function will convert a body of HTML into a multidimensional array that contains each element's tag name together with its classes and ids (the attributes it extracts).

<?php

/**
 * Converts a body of HTML into a nested array describing its element tree.
 *
 * <script> blocks, <style> blocks and comments are stripped first. Every
 * remaining tag is extracted, normalised by checkTags(), and the flat tag
 * list is folded into a tree by htmlToArray().
 *
 * @param string $raw_html The HTML markup to convert.
 * @return array The nested array built by htmlToArray(), or an empty array
 *               when the markup contains no tags at all.
 */
function htmlArrayer($raw_html){

    // No "U" (ungreedy) modifier here: combined with ".*?" it would flip the
    // quantifier back to greedy and delete everything between the first
    // <style> and the last </style>.
    $match_scripts = '@<script[^>]*?>.*?</script>@si';
    $match_styles = '@<style[^>]*?>.*?</style>@si';
    $match_comments = '/<!--.*?-->/si';
    $match_open_or_closed = '/(\<(\/?[^\>]+)\>)/';

    $raw_html = preg_replace($match_scripts, '', (string) $raw_html);
    $raw_html = preg_replace($match_styles, '', $raw_html);
    $raw_html = preg_replace($match_comments, '', $raw_html);

    // Pad the angle brackets so adjacent tags and text are always separated.
    $raw_html = str_replace('>', '> ', $raw_html);
    $raw_html = str_replace('<', ' <', $raw_html);
    $raw_html = str_replace('!--', '!-- ', $raw_html);

    // Turn every whitespace character into a plain space so that explode(' ')
    // can be used on the tag contents later on.
    $raw_html = preg_replace('/[ \t\r\n]/', ' ', $raw_html);

    // Without any tag there is no tree to build; bail out before indexing
    // into an empty match list.
    if (!preg_match_all($match_open_or_closed, $raw_html, $matches)) {
        return array();
    }

    // Trim the captured tag contents so "</div >" still equals "/div" when
    // the closing tag is compared against its opener.
    $tags = checkTags(array_map('trim', $matches[2]));

    return htmlToArray($tags, 0);

}

/**
 * Replaces every tag that is not on the whitelist with the placeholder 'br'.
 *
 * Each entry is trimmed and split on spaces; its first token (optionally
 * prefixed by a single '/') is looked up in the whitelist. Entries that pass
 * are returned untouched, all others become the string 'br'. Array keys are
 * preserved.
 *
 * @param array $htmlArray Tag contents without the angle brackets.
 * @return array The same list with unknown tags replaced by 'br'.
 */
function checkTags($htmlArray) {
    $allowed = array('html', 'body', 'div', 'span', 'applet', 'object', 'iframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'pre', 'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'code', 'del', 'dfn', 'em', 'font', 'img', 'ins', 'kbd', 'q', 's', 'samp', 'small', 'strike', 'strong', 'sub', 'sup', 'tt', 'var', 'b', 'u', 'i', 'center', 'dl', 'dt', 'dd', 'ol', 'ul', 'li','fieldset', 'form', 'label', 'legend', 'table', 'caption', 'tbody', 'tfoot', 'thead', 'tr', 'th', 'td');

    foreach (array_keys($htmlArray) as $position) {
        $tokens = explode(' ', trim($htmlArray[$position]));
        $name = $tokens[0];

        // A closing tag is accepted when the name behind its single leading
        // slash is on the whitelist.
        if ($name !== '' && $name[0] === '/') {
            $name = substr($name, 1);
        }

        if (!in_array($name, $allowed, true)) {
            $htmlArray[$position] = 'br';
        }
    }

    return $htmlArray;
}

/**
 * Recursively folds a flat list of tags into a nested array.
 *
 * The result for one element is an array whose first entry is the string
 * "<tag> <css>" (tag name, a space, then whatever attrToCSS() returns for the
 * tag) and whose remaining entries are the arrays of its direct children, in
 * document order. Void elements (br, img, ...) become single-entry arrays.
 *
 * @param array $untiered_array Flat list of tag contents (no angle brackets),
 *                              closing tags written as "/name".
 * @param int   $index          Position of the element to build.
 * @return array The nested array for the element at $index, or an empty
 *               array when there is no element to build.
 */
function htmlToArray($untiered_array, $index){
    // Elements that never have a closing tag. Built once, not per iteration.
    $one_way_elements = array('br', 'img', 'area', 'base', 'basefont', 'hr', 'input', 'link', 'meta', 'col', 'embed', 'param');

    // Skip leading 'br' placeholders (e.g. a doctype that checkTags()
    // replaced) so the tree starts at the first real element.
    while (isset($untiered_array[$index])) {
        $leading = explode(' ', $untiered_array[$index]);
        if ($leading[0] !== 'br') {
            break;
        }
        $index++;
    }

    if (!isset($untiered_array[$index])) {
        return array();
    }

    $untiered_element = explode(' ', $untiered_array[$index]);
    $closing_tag = '/' . $untiered_element[0];

    $new_array_layer = array($untiered_element[0] . ' ' . attrToCSS($untiered_array[$index]));

    // Nesting depth relative to the current element: 0 means "directly
    // inside it", so only tags seen at depth 0 are its direct children.
    $tier_check = 0;

    // Walk forward until the matching closing tag is met at depth 0. The
    // isset() bound stops the walk at the end of the list when the markup
    // never closes the element.
    for ($i = $index + 1; isset($untiered_array[$i]); $i++) {
        $current = $untiered_array[$i];

        if ($tier_check === 0 && $current === $closing_tag) {
            break;
        }

        $next_element_name = explode(' ', $current);

        // Void elements do not change the depth; record them only when they
        // are direct children.
        if (in_array($next_element_name[0], $one_way_elements, true)) {
            if ($tier_check === 0) {
                $new_array_layer[] = array($next_element_name[0] . ' ' . attrToCSS($current));
            }
            continue;
        }

        if (strpos($current, '/') === 0) {
            // A closing tag takes us one level back up.
            $tier_check--;
        } else {
            // An opening tag goes one level down; recurse only for direct
            // children, deeper descendants are handled by that recursion.
            $tier_check++;
            if ($tier_check === 1) {
                $new_array_layer[] = htmlToArray($untiered_array, $i);
            }
        }
    }

    return $new_array_layer;
}

/**
 * Builds a space-separated CSS selector string from a tag's id and classes.
 *
 * The id (if any) comes first as "#id", followed by one ".class" token per
 * class name, e.g. 'div id="foo" class="bar baz"' gives "#foo .bar .baz".
 * The result can be split again with explode(' ').
 *
 * @param string $attr_string Tag contents, e.g. 'div class="a b" id="c"'.
 * @return string The selector tokens, or an empty string when the tag has
 *                neither an id nor a class.
 */
function attrToCSS($attr_string){
    $attr_string = (string) $attr_string;
    $selectors = array();

    // The negative lookbehind keeps attributes such as data-id="..." or
    // valid="..." from being mistaken for id="...". The value is captured
    // between matching single or double quotes and may be empty.
    if (preg_match('/(?<![\w-])id\s*=\s*(["\'])(.*?)\1/i', $attr_string, $id_match)) {
        $id_value = trim($id_match[2]);
        if ($id_value !== '') {
            $selectors[] = '#' . $id_value;
        }
    }

    if (preg_match('/(?<![\w-])class\s*=\s*(["\'])(.*?)\1/i', $attr_string, $class_match)) {
        // Split on whitespace runs so repeated spaces do not yield bare "." tokens.
        $class_names = preg_split('/\s+/', $class_match[2], -1, PREG_SPLIT_NO_EMPTY);
        foreach ($class_names as $class_name) {
            $selectors[] = '.' . $class_name;
        }
    }

    return implode(' ', $selectors);
}


?>
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.