Return to Snippet

Revision: 68228
at December 16, 2014 01:13 by nigelnquande


Updated Code
/**
 * Render the tables of an HTML document as Markdown-flavoured plain text.
 *
 * Output layout:
 *  - if the document has a <title>, it is emitted first as "# title" followed
 *    by a blank line;
 *  - every table is framed by "+==========" lines and its rows are separated
 *    by "+----------" lines;
 *  - <th> cells are written as "__text__ | " after a leading "| ";
 *  - <td> cells are written as "text | "; when a cell contains <a> elements
 *    only those links are written, as "[text](href)", and any other text in
 *    the cell is not emitted; an <img> inside a link becomes "![alt](src)"
 *    in place of the link text;
 *  - finally, runs of two or more spaces are collapsed to a single space.
 *
 * @param string      $html HTML markup to parse.
 * @param string|null $id   Optional id attribute of the single element
 *                          (normally a <table>) to render. When omitted, or
 *                          not a non-empty string, every <table> in the
 *                          document is rendered. When no element carries
 *                          that id, no table output is produced.
 *
 * @return string The plain-text rendering.
 *
 * @throws DOMException When $html is empty or cannot be loaded into a DOMDocument.
 */
function html_table_to_plain($html, $id = NULL) {
    // Reject empty input up front with the same exception used for parse
    // failures, so callers only have one failure type to handle.
    if ($html === NULL || $html === '') {
        throw new DOMException('Could not Load HTML into DOMDocument');
    }

    $DOM = new DOMDocument();
    if ($DOM->loadHTML($html) === FALSE) {
        throw new DOMException('Could not Load HTML into DOMDocument');
    }

    $plain_message = '';
    $titles_list = $DOM->getElementsByTagName('title');
    if ($titles_list->length) {
        $plain_message = '# ' . $titles_list->item(0)->nodeValue . "\n\n";
    }

    if (is_string($id) && $id !== '') {
        // getElementById() yields a single element (or NULL), not a node
        // list, so wrap it to share the rendering loop below.
        $target = $DOM->getElementById($id);
        $tables = ($target === NULL) ? [] : [$target];
    } else {
        $tables = $DOM->getElementsByTagName('table');
    }

    foreach ($tables as $table) {
        $plain_message .= "+==========\n";

        $rows = $table->getElementsByTagName('tr');
        $last_row = $rows->length - 1;

        foreach ($rows as $index => $row) {
            $headers = $row->getElementsByTagName('th');
            if ($headers->length > 0) {
                $plain_message .= '| ';
                foreach ($headers as $th) {
                    $plain_message .= '__' . $th->nodeValue . '__ | ';
                }
            }

            foreach ($row->getElementsByTagName('td') as $cell) {
                $links = $cell->getElementsByTagName('a');

                // A cell without links is written as its text content.
                if ($links->length === 0) {
                    $plain_message .= $cell->nodeValue;
                }

                foreach ($links as $link) {
                    $plain_message .= '[';

                    $imgs = $link->getElementsByTagName('img');
                    // A link without images uses its text as the label;
                    // otherwise each image becomes the label.
                    if ($imgs->length === 0) {
                        $plain_message .= $link->nodeValue;
                    }
                    foreach ($imgs as $img) {
                        $plain_message .= '![' . $img->getAttribute('alt')
                            . '](' . $img->getAttribute('src') . ')';
                    }

                    $plain_message .= '](' . $link->getAttribute('href') . ')';
                }

                $plain_message .= ' | ';
            }

            // Separator between rows, but not after the last one.
            if ($index < $last_row) {
                $plain_message .= "\n+----------\n";
            }
        }

        $plain_message .= "\n+==========\n";
    }

    // Collapse runs of spaces left over from the source markup.
    return preg_replace('|(?mi-Us)[ ]{2,}|', ' ', $plain_message);
}

Revision: 68227
at December 16, 2014 01:11 by nigelnquande


Initial Code
/**
 * Convert the tables found in an HTML document to Markdown-style plain text.
 *
 * The document <title>, when present, is written first as "# title" and a
 * blank line. Each table is enclosed in "+==========" lines, with
 * "+----------" between consecutive rows. Header cells (<th>) are written as
 * "__text__ | " preceded by "| "; data cells (<td>) as "text | ". A data
 * cell that contains <a> elements is written as its links only, in the form
 * "[text](href)" (other text in that cell is not emitted), and an <img>
 * inside a link is written as "![alt](src)" instead of the link text.
 * Runs of two or more spaces in the result are collapsed to one.
 *
 * @param string      $html HTML markup to parse.
 * @param string|null $id   Optional id attribute of the one element
 *                          (normally a <table>) to convert. Any other value
 *                          than a non-empty string converts every <table>.
 *                          An id that matches nothing yields no table output.
 *
 * @return string The plain-text rendering.
 *
 * @throws DOMException When $html is empty or cannot be loaded into a DOMDocument.
 */
function html_table_to_plain($html, $id = NULL) {
    // Empty input fails the same way as unparseable input.
    if ($html === NULL || $html === '') {
        throw new DOMException('Could not Load HTML into DOMDocument');
    }

    $DOM = new DOMDocument();
    if ($DOM->loadHTML($html) === FALSE) {
        throw new DOMException('Could not Load HTML into DOMDocument');
    }

    $plain_message = '';
    $titles_list = $DOM->getElementsByTagName('title');
    if ($titles_list->length) {
        $plain_message = '# ' . $titles_list->item(0)->nodeValue . "\n\n";
    }

    if (is_string($id) && $id !== '') {
        // getElementById() returns one element or NULL rather than a list;
        // normalise it to an array so the loop below handles both cases.
        $target = $DOM->getElementById($id);
        $tables = ($target === NULL) ? [] : [$target];
    } else {
        $tables = $DOM->getElementsByTagName('table');
    }

    foreach ($tables as $table) {
        $plain_message .= "+==========\n";

        $rows = $table->getElementsByTagName('tr');
        $last_row = $rows->length - 1;

        foreach ($rows as $index => $row) {
            $headers = $row->getElementsByTagName('th');
            if ($headers->length > 0) {
                $plain_message .= '| ';
                foreach ($headers as $th) {
                    $plain_message .= '__' . $th->nodeValue . '__ | ';
                }
            }

            foreach ($row->getElementsByTagName('td') as $cell) {
                $links = $cell->getElementsByTagName('a');

                // No links: emit the cell's text content as-is.
                if ($links->length === 0) {
                    $plain_message .= $cell->nodeValue;
                }

                foreach ($links as $link) {
                    $plain_message .= '[';

                    $imgs = $link->getElementsByTagName('img');
                    // The label is the link text unless the link wraps images.
                    if ($imgs->length === 0) {
                        $plain_message .= $link->nodeValue;
                    }
                    foreach ($imgs as $img) {
                        $plain_message .= '![' . $img->getAttribute('alt')
                            . '](' . $img->getAttribute('src') . ')';
                    }

                    $plain_message .= '](' . $link->getAttribute('href') . ')';
                }

                $plain_message .= ' | ';
            }

            // Row separator, omitted after the final row.
            if ($index < $last_row) {
                $plain_message .= "\n+----------\n";
            }
        }

        $plain_message .= "\n+==========\n";
    }

    // Collapse runs of spaces left over from the source markup.
    return preg_replace('|(?mi-Us)[  ]{2,}|', ' ', $plain_message);
}

Initial URL


Initial Description
Use PHP's DOM parser (DOMDocument) to convert HTML tables into Markdown-style plain text, including the page title, links, and images inside links.

Initial Title
HTML Table to plain text

Initial Tags
php, table, html, DOM, text

Initial Language
PHP