Return to Snippet

Revision: 9115
at October 22, 2008 03:15 by liuran


Initial Code
/**
 * Parses well-formed (X)HTML with PHP's XML extension.
 *
 * parse() produces two results:
 *  - a tree of tag arrays (the return value, also kept in $output), and
 *  - a normalised HTML string, available through get_html().
 *
 * Each tag array has the keys:
 *  - "name"      tag name as reported by the parser (upper-cased, because
 *                case folding is switched on),
 *  - "attr"      attribute map as reported by the parser,
 *  - "level"     nesting depth, 1 for the root element,
 *  - "tagData"   concatenated non-blank character data (only if present),
 *  - "innerhtml" normalised HTML between the opening and closing tag
 *                (not set for "br" and "input"),
 *  - "children"  list of child tag arrays (only if there are children).
 */
class HTML_Parser {
    // Private properties
    var $_parser;
    var $_tags = array();
    var $_html;
    var $output = array();
    var $strXmlData;
    var $_level = 0;
    // Kept for backward compatibility; mirrors $_html.
    var $_outline;
    var $_tagcount = array();
    var $xml_error = false;
    var $xml_error_code;
    var $xml_error_string;
    var $xml_error_line_number;
    // Offset into $_html where the content of the open tag at each level starts.
    var $_inner_start = array();

    /**
     * Returns the normalised HTML built by the most recent parse() call.
     *
     * @return string|null HTML string, or null if parse() was never called.
     */
    function get_html () {
        return $this->_html;
    }

    /**
     * Parses a well-formed (X)HTML string.
     *
     * All per-document state is reset first, so one instance can be reused
     * for several documents.
     *
     * @param string $strInputXML Markup to parse.
     * @return array|false The tag tree (root element at index 0), or false on
     *                     a parse error. Error details are then available in
     *                     $xml_error_code, $xml_error_string and
     *                     $xml_error_line_number.
     */
    function parse($strInputXML) {
        // Reset everything a previous run may have left behind.
        $this->output = array();
        $this->_tags = array();
        $this->_inner_start = array();
        $this->_level = 0;
        $this->_html = '';
        $this->_outline = '';
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Translate entities
        $strInputXML = $this->translate_entities($strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);
        // Array callables work without xml_set_object(), which is deprecated
        // as of PHP 8.4.
        xml_set_element_handler($this->_parser, array($this, 'tagOpen'), array($this, 'tagClosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        // The whole document is passed at once, so mark it as the final chunk;
        // otherwise truncated input is not reported as an error.
        $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

        if (!$this->strXmlData) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }

        return $this->output;
    }

    /**
     * Element-start handler: records the tag and appends it to the HTML.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $name   Tag name.
     * @param array  $attr   Attribute map.
     */
    function tagOpen($parser, $name, $attr) {
        // Increase level
        $this->_level++;

        // Create tag:
        $newtag = $this->create_tag($name, $attr);

        // Build tag
        $tag = array("name"=>$name,"attr"=>$attr, "level"=>$this->_level);

        // Add tag
        array_push($this->output, $tag);

        // Add tag to this level
        $this->_tags[$this->_level] = $tag;

        // Add to HTML
        $this->_html .= $newtag;
        $this->_outline .= $newtag;

        // Remember where this element's content begins so that tagClosed()
        // can slice the innerHTML straight out of $_html.
        $this->_inner_start[$this->_level] = strlen($this->_html);
    }

    /**
     * Builds the normalised opening tag for an element.
     *
     * Names and attribute keys are lower-cased, attribute values are passed
     * through htmlentities(), and "br"/"input" are written self-closing.
     *
     * @param string $name Tag name.
     * @param array  $attr Attribute map.
     * @return string Opening tag, e.g. '<a href="x">' or '<br />'.
     */
    function create_tag ($name, $attr) {
        $lower = strtolower($name);

        # Begin with name
        $tag = '<' . $lower . ' ';

        # Create attribute list
        foreach ($attr as $key=>$val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }

        # Finish tag (drops the trailing space)
        $tag = trim($tag);

        if ($this->_is_void_tag($lower)) {
            $tag .= ' /';
        }

        return $tag . '>';
    }

    /**
     * Character-data handler.
     *
     * Non-blank text is appended to the "tagData" of the innermost open tag.
     * All text, blank or not, is appended entity-encoded to the HTML.
     *
     * @param mixed  $parser  Parser handle (unused).
     * @param string $tagData Text chunk.
     */
    function tagData($parser, $tagData) {
        $last = count($this->output) - 1;

        // Compare against '' explicitly: a bare truthiness test would also
        // discard the text "0".
        if ($last >= 0 && trim($tagData) !== '') {
            if (isset($this->output[$last]['tagData'])) {
                $this->output[$last]['tagData'] .= $tagData;
            } else {
                $this->output[$last]['tagData'] = $tagData;
            }
        }

        $encoded = htmlentities($tagData);
        $this->_html .= $encoded;
        $this->_outline .= $encoded;
    }

    /**
     * Element-end handler: stores the innerHTML, appends the closing tag and
     * moves the finished tag into its parent's "children" list.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $name   Tag name.
     */
    function tagClosed($parser, $name) {
        $lower = strtolower($name);
        $last = count($this->output) - 1;

        if ($last < 0) {
            // No open tag to close; nothing to do.
            return;
        }

        if (!$this->_is_void_tag($lower)) {
            // Everything appended to $_html since the opening tag is this
            // element's innerHTML. It must be captured before the closing
            // tag is appended.
            if (isset($this->_inner_start[$this->_level])) {
                $this->output[$last]['innerhtml'] =
                    (string) substr($this->_html, $this->_inner_start[$this->_level]);
            }

            $this->_html .= '</' . $lower . '>';
            $this->_outline .= '</' . $lower . '>';
        }
        unset($this->_inner_start[$this->_level]);

        // Fix tree: attach the finished tag to its parent. The root element
        // has no parent and simply stays at index 0.
        if ($last > 0) {
            $this->output[$last - 1]['children'][] = $this->output[$last];
            array_pop($this->output);
        }

        // Decrease level
        $this->_level--;
    }

    /**
     * Converts named HTML entities to numeric character references, which an
     * XML parser understands, or back again.
     *
     * The characters & " < > (and ') are left alone because XML predefines
     * their entities.
     *
     * @param string $xmlSource Text to convert.
     * @param bool   $reverse   When true, convert numeric references back to
     *                          named entities.
     * @return string Converted text.
     */
    function translate_entities($xmlSource, $reverse =FALSE) {
        static $literal2NumericEntity;

        if (empty($literal2NumericEntity)) {
            $literal2NumericEntity = array();
            $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');

            foreach ($transTbl as $char => $entity) {
                if (in_array((string) $char, array('&', '"', '<', '>', "'"), true)) {
                    continue;
                }
                $literal2NumericEntity[$entity] = '&#' . $this->_utf8_code_point((string) $char) . ';';
            }
        }

        if ($reverse) {
            return strtr($xmlSource, array_flip($literal2NumericEntity));
        }

        return strtr($xmlSource, $literal2NumericEntity);
    }

    /**
     * Tells whether a lower-cased tag name is written self-closing.
     *
     * @param string $lowerName Lower-cased tag name.
     * @return bool
     */
    function _is_void_tag($lowerName) {
        return $lowerName === 'br' || $lowerName === 'input';
    }

    /**
     * Returns the Unicode code point of a single UTF-8 encoded character.
     *
     * ord() alone only looks at the first byte, which is wrong for every
     * multi-byte character. If the bytes do not form a complete UTF-8
     * sequence (for example a single-byte charset), the value of the first
     * byte is returned.
     *
     * @param string $char One character.
     * @return int Code point.
     */
    function _utf8_code_point($char) {
        $lead = ord($char[0]);

        if ($lead < 0x80) {
            return $lead;
        }

        if (($lead & 0xE0) === 0xC0) {
            $code = $lead & 0x1F;
            $extra = 1;
        } elseif (($lead & 0xF0) === 0xE0) {
            $code = $lead & 0x0F;
            $extra = 2;
        } elseif (($lead & 0xF8) === 0xF0) {
            $code = $lead & 0x07;
            $extra = 3;
        } else {
            return $lead;
        }

        if (strlen($char) <= $extra) {
            return $lead;
        }

        for ($i = 1; $i <= $extra; $i++) {
            $code = ($code << 6) | (ord($char[$i]) & 0x3F);
        }

        return $code;
    }
}

Initial URL


Initial Description


Initial Title
php html Parser

Initial Tags
html

Initial Language
PHP