Revision: 9115
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at October 22, 2008 03:15 by liuran
Initial Code
/**
 * Event-driven parser that turns well-formed (XML-style) HTML into a nested
 * array of tags.
 *
 * The class is built on PHP's xml_* extension, so the input has to be
 * well-formed XML. parse() returns the tag tree; get_html() returns the markup
 * re-serialised from the parser events of the most recent parse() call.
 *
 * Each tag in the tree is an array with the keys "name", "attr" and "level",
 * plus, when applicable, "tagData" (non-blank text), "innerhtml" and
 * "children" (list of nested tag arrays).
 */
class HTML_Parser
{
    // Private properties (public visibility is kept because they were declared with `var`)

    /** XML parser handle; only set while parse() is running. */
    public $_parser;

    /** Tag record (name/attr/level) most recently opened at each nesting level. */
    public $_tags = array();

    /** HTML rebuilt from the parser events. */
    public $_html = '';

    /** Stack of open tags while parsing; the finished tree after a successful parse(). */
    public $output = array();

    /** Raw return value of xml_parse() from the last parse() call. */
    public $strXmlData;

    /** Current nesting depth (0 when no element is open). */
    public $_level = 0;

    /** Copy of the rebuilt HTML in which not-yet-closed tags carry their level as a prefix. */
    public $_outline = '';

    /** Not used by this class; kept so code reading the property keeps working. */
    public $_tagcount = array();

    /** True when the last parse() call failed. */
    public $xml_error = false;

    /** Error code reported by xml_get_error_code() for the last failed parse(). */
    public $xml_error_code;

    /** Human-readable message for $xml_error_code. */
    public $xml_error_string;

    /** Line number at which the last failed parse() stopped. */
    public $xml_error_line_number;

    /**
     * Returns the HTML rebuilt during the most recent parse() call.
     *
     * @return string
     */
    public function get_html()
    {
        return $this->_html;
    }

    /**
     * Parses a well-formed HTML/XML string into a tag tree.
     *
     * @param string $strInputXML Markup to parse; named HTML entities are allowed.
     * @return array|false The tag tree (root element at index 0), an empty array
     *                     for blank input, or false on a parse error (details are
     *                     then available in the xml_error* properties).
     */
    public function parse($strInputXML)
    {
        // Start from a clean slate so a reused instance does not carry over
        // HTML, nesting level or error flags from an earlier call.
        $this->output = array();
        $this->_tags = array();
        $this->_html = '';
        $this->_outline = '';
        $this->_level = 0;
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Nothing to parse: keep returning an empty tree instead of reporting
        // an "empty document" error now that the input is parsed as final.
        if (trim($strInputXML) === '') {
            return $this->output;
        }

        // Translate entities (the XML parser only knows the five XML ones)
        $strInputXML = $this->translate_entities($strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);

        // Array callables instead of xml_set_object() + method-name strings.
        xml_set_element_handler($this->_parser, array($this, 'tagOpen'), array($this, 'tagClosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        // The whole document is passed at once, so mark it as the final chunk;
        // otherwise truncated input (e.g. unclosed tags) is not reported.
        $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

        if (!$this->strXmlData) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
        }

        // Drop the handle once the error details have been read so the parser
        // (which references this object through its handlers) can be released.
        $this->_parser = null;

        if ($this->xml_error) {
            return false;
        }

        return $this->output;
    }

    /**
     * Start-element handler: records the tag and appends it to the rebuilt HTML.
     *
     * @param mixed  $parser Parser handle passed by the XML extension (unused).
     * @param string $name   Element name as reported by the parser.
     * @param array  $attr   Attribute name => value map.
     * @return void
     */
    public function tagOpen($parser, $name, $attr)
    {
        // Increase level
        $this->_level++;

        // Create tag
        $newtag = $this->create_tag($name, $attr);

        // Build tag
        $tag = array("name" => $name, "attr" => $attr, "level" => $this->_level);

        // Add tag
        array_push($this->output, $tag);

        // Add tag to this level
        $this->_tags[$this->_level] = $tag;

        // Add to HTML
        $this->_html .= $newtag;

        // Add to outline, prefixed with the level so tagClosed() can find
        // exactly this opening tag again
        $this->_outline .= $this->_level . $newtag;
    }

    /**
     * Builds the lower-case opening tag string for an element.
     *
     * @param string $name Element name.
     * @param array  $attr Attribute name => value map; values are entity-encoded.
     * @return string e.g. <a href="x"> or <br />
     */
    public function create_tag($name, $attr)
    {
        $name = strtolower($name);

        // Begin with name
        $tag = '<' . $name . ' ';

        // Create attribute list
        foreach ($attr as $key => $val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }

        // Finish tag
        $tag = trim($tag);
        if ($this->is_self_closing($name)) {
            $tag .= ' /';
        }

        return $tag . '>';
    }

    /**
     * Character-data handler: stores non-blank text on the innermost open tag
     * and appends the (entity-encoded) text to the rebuilt HTML.
     *
     * @param mixed  $parser  Parser handle passed by the XML extension (unused).
     * @param string $tagData Text chunk; one text node may arrive in several chunks.
     * @return void
     */
    public function tagData($parser, $tagData)
    {
        // Compare against '' explicitly: a bare truthiness test would also
        // discard the text "0".
        if (trim($tagData) !== '') {
            $last = count($this->output) - 1;
            if ($last >= 0) {
                if (isset($this->output[$last]['tagData'])) {
                    $this->output[$last]['tagData'] .= $tagData;
                } else {
                    $this->output[$last]['tagData'] = $tagData;
                }
            }
        }

        $encoded = htmlentities($tagData);
        $this->_html .= $encoded;
        $this->_outline .= $encoded;
    }

    /**
     * End-element handler: closes the tag in the rebuilt HTML, captures its
     * inner HTML and moves the finished tag into its parent's "children" list.
     *
     * @param mixed  $parser Parser handle passed by the XML extension (unused).
     * @param string $name   Element name as reported by the parser.
     * @return void
     */
    public function tagClosed($parser, $name)
    {
        $closing = '</' . strtolower($name) . '>';

        // Add to HTML and outline (self-closing tags have no separate end tag)
        if (!$this->is_self_closing(strtolower($name))) {
            $this->_outline .= $this->_level . $closing;
            $this->_html .= $closing;
        }

        // Get the opening tag that belongs to this end
        $open = $this->_tags[$this->_level];
        $opening = $this->create_tag($open['name'], $open['attr']);

        // Try to get innerHTML: everything between the level-marked opening
        // and closing tags in the outline
        $regex = '%' . preg_quote($this->_level . $opening, '%')
            . '(.*?)'
            . preg_quote($this->_level . $closing, '%') . '%is';
        $found = preg_match($regex, $this->_outline, $matches) === 1;

        // Remove level identifiers so the markers cannot match again later
        $this->_outline = str_replace($this->_level . $opening, $opening, $this->_outline);
        $this->_outline = str_replace($this->_level . $closing, $closing, $this->_outline);

        $last = count($this->output) - 1;

        // Add innerHTML
        if ($found) {
            $this->output[$last]['innerhtml'] = $matches[1];
        }

        // Fix tree: hand the finished tag to its parent. The root element has
        // no parent and simply stays in $output as the result of parse().
        if ($last > 0) {
            $this->output[$last - 1]['children'][] = $this->output[$last];
            array_pop($this->output);
        }

        // Decrease level
        $this->_level--;
    }

    /**
     * Converts named HTML entities to numeric ones (or back).
     *
     * The entities for & " < > are left untouched in both directions.
     *
     * @param string $xmlSource Text containing entities.
     * @param bool   $reverse   When true, numeric entities are turned back into named ones.
     * @return string
     */
    public function translate_entities($xmlSource, $reverse = FALSE)
    {
        static $literal2NumericEntity = null;

        if ($literal2NumericEntity === null) {
            $literal2NumericEntity = array();

            // Ask for the UTF-8 table explicitly so every character can be
            // decoded to its real code point regardless of default_charset.
            $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            foreach ($transTbl as $char => $entity) {
                if (in_array((string) $char, array('&', '"', '<', '>'), true)) {
                    continue;
                }
                $literal2NumericEntity[$entity] = '&#' . $this->utf8_codepoint((string) $char) . ';';
            }
        }

        if ($reverse) {
            return strtr($xmlSource, array_flip($literal2NumericEntity));
        }

        return strtr($xmlSource, $literal2NumericEntity);
    }

    /**
     * Tells whether a (lower-case) element name is written as a self-closing tag.
     *
     * @param string $name Lower-case element name.
     * @return bool
     */
    private function is_self_closing($name)
    {
        return $name === 'br' || $name === 'input';
    }

    /**
     * Returns the Unicode code point of a single UTF-8 encoded character.
     *
     * A byte-wise ord() would only look at the first byte and therefore give
     * the wrong number for every non-ASCII character.
     *
     * @param string $char Exactly one UTF-8 character (1 to 4 bytes).
     * @return int
     */
    private function utf8_codepoint($char)
    {
        $length = strlen($char);
        if ($length === 0) {
            return 0;
        }

        $lead = ord($char[0]);
        if ($length === 1 || $lead < 0x80) {
            return $lead;
        }

        // Strip the length marker bits from the lead byte ...
        if (($lead & 0xE0) === 0xC0) {
            $code = $lead & 0x1F;
        } elseif (($lead & 0xF0) === 0xE0) {
            $code = $lead & 0x0F;
        } else {
            $code = $lead & 0x07;
        }

        // ... then append the 6 payload bits of every continuation byte.
        for ($i = 1; $i < $length; $i++) {
            $code = ($code << 6) | (ord($char[$i]) & 0x3F);
        }

        return $code;
    }
}
Initial URL
Initial Description
Initial Title
php html Parser
Initial Tags
html
Initial Language
PHP