Revision: 23571
Updated Code
at February 9, 2010 12:35 by tclancy
Updated Code
using System; using System.Collections.Generic; using System.Text; using System.Text.RegularExpressions; using System.Collections; using TidyATL; using ProjectName.Core.Interfaces; namespace ProjectName.Core.Content { public class HtmlFormatter : IFormatter { private string _contents; private Hashtable _tagMap = new Hashtable(); private ArrayList _singleTags = new ArrayList(); private string _urlPlaceholder = "[** URL_ROOT **]"; public HtmlFormatter(string content) { this._contents = content; this.Setup(); } private void Setup() { // create a lookup table for tags: // key = tag // -1 = strip tag and contents inside tag completely // 0 = allow tag, no attributes // 1 = allow tag with attributes // N.B., all other tags should be stripped this._tagMap.Add("head", -1); this._tagMap.Add("select", -1); this._tagMap.Add("input", -1); this._tagMap.Add("script", -1); this._tagMap.Add("noscript", -1); this._tagMap.Add("xmp", -1); this._tagMap.Add("style", -1); this._tagMap.Add("a", 1); this._tagMap.Add("table", 1); this._tagMap.Add("tr", 1); this._tagMap.Add("th", 1); this._tagMap.Add("td", 1); this._tagMap.Add("ul", 0); this._tagMap.Add("ol", 0); this._tagMap.Add("li", 0); this._tagMap.Add("p", 1); this._tagMap.Add("xml", 1); this._tagMap.Add("img", 1); this._tagMap.Add("br", 0); this._tagMap.Add("hr", 0); this._tagMap.Add("b", 0); this._tagMap.Add("strong", 0); this._tagMap.Add("i", 0); this._tagMap.Add("u", 0); this._tagMap.Add("strike", 0); this._tagMap.Add("sup", 0); this._tagMap.Add("sub", 0); this._tagMap.Add("iframe", 1); } #region IFormatter Members public string Render() { // fix links formatMe = this.ReplaceRelativeUrlPlaceholder(formatMe); // find assets return formatMe; } public string Clean() { string formatMe = this.CleanTags(this._contents); return formatMe; } #endregion #region HTML Markup Handling private string InsertRelativeUrlPlaceholder(string input) { string formatMe = input; string searchTerm = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"]; if (formatMe.IndexOf(searchTerm) > -1) { Regex reg = new Regex(searchTerm); MatchCollection matches = reg.Matches(formatMe); foreach (Match m in matches) { formatMe = formatMe.Replace(m.ToString(), this._urlPlaceholder); } } return formatMe; } private string ReplaceRelativeUrlPlaceholder(string input) { string formatMe = input; formatMe = formatMe.Replace(this._urlPlaceholder, System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"]); return formatMe; } private string TidyHTML(string input) { Tidy.Document doc = new Tidy.Document(); //doc.OnMessage += new Tidy.IDocumentEvents_OnMessageEventHandler(TidyDiagnostics); // set some options doc.SetOptBool(TidyOptionId.TidyBodyOnly, 1); doc.SetOptBool(TidyOptionId.TidyXhtmlOut, 1); doc.SetOptBool(TidyOptionId.TidyWord2000, 1); doc.SetOptValue(TidyOptionId.TidyIndentContent, "auto"); int err_code = doc.ParseString(input); if (err_code < 0) { throw new Exception("Unable to parse string: " + input); } err_code = doc.CleanAndRepair(); if (err_code < 0) { throw new Exception("Unable to clean/repair string: " + input); } //err_code = doc.RunDiagnostics(); //if (err_code < 0) //{ // throw new Exception("Unable to run diagnostics on: " + input); //} return(doc.SaveString().Trim()); } public void TidyDiagnostics(TidyATL.TidyReportLevel level, int line, int col, string message) { Console.WriteLine("Tidy diagnostic message: " + message); } private int InStrEndOfTag (string input) { bool attr = false; int pos = 0; while (pos < input.Length) { pos++; if (!attr && (input.Substring(pos, 1) == ">")) { return pos; } if (input.Substring(pos, 1) == "") { attr = !attr; } } return pos; } private string RemoveExtraTags(string input) { string temp = input; string output = ""; int pos; string tag, name; while (temp != "") { if (temp.Substring(0, 1) == "<") { pos = InStrEndOfTag(temp); if (pos == 0) { tag = temp.Substring(2); temp = ""; } else { tag = temp.Substring(1, pos - 1); temp = temp.Substring(pos + 1); } name = tag.Split(new Char[] { ' ' })[0].ToLower(); if (name.Substring(0, 1) == "/") { name = name.Substring(1); } if (this._tagMap.Contains(name)) { int val = Convert.ToInt32(this._tagMap[name].ToString()); switch (val) { case -1: pos = temp.ToLower().IndexOf("</" + name + ">"); if (pos > 0) { temp = temp.Substring(pos + name.Length + 3); } break; case 0: output += "<"; if (tag.Substring(0, 1) == "/") { output += "/"; } output += name + ">"; break; case 1: output += "<" + tag + ">"; break; default: break; } } } else { output += temp.Substring(0,1); temp = temp.Substring(1); } } return output; } private string CleanTags(string input) { // run HTML Tidy on content string formatMe = this.TidyHTML(input.Trim()); // get rid of comments first to make tag balancing a little easier formatMe = this.StripComments(formatMe); // remove attributes that are unacceptable in any case (e.g., JavaScript attributes, CSS) formatMe = this.ReplaceNastyAttributes(formatMe); formatMe = RemoveExtraTags(formatMe); // substitute placeholder for relative links formatMe = this.InsertRelativeUrlPlaceholder(formatMe); return formatMe; } private string StripComments(string input) { Regex regex = new Regex("<!--.*-->"); return regex.Replace(input, ""); } private string ReplaceNastyAttributes(string input) { Regex regex = new Regex("( on[a-z]{1,}|style|id)=[\"'](.*?)[\"']"); return regex.Replace(input, ""); } #endregion HTML Markup Handling } }
Revision: 23570
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at February 9, 2010 12:33 by tclancy
Initial Code
using System; using System.Collections.Generic; using System.Text; using System.Text.RegularExpressions; using System.Collections; using TidyATL; using ProjectName.Core.Interfaces; namespace ProjectName.Core.Content { public class HtmlFormatter : IFormatter { private string _contents; private Hashtable _tagMap = new Hashtable(); private ArrayList _singleTags = new ArrayList(); private string _urlPlaceholder = "[** URL_ROOT **]"; public HtmlFormatter(string content) { this._contents = content; this.Setup(); } private void Setup() { // create a lookup table for tags: // key = tag // -1 = strip tag and contents inside tag completely // 0 = allow tag, no attributes // 1 = allow tag with attributes // N.B., all other tags should be stripped this._tagMap.Add("head", -1); this._tagMap.Add("select", -1); this._tagMap.Add("input", -1); this._tagMap.Add("script", -1); this._tagMap.Add("noscript", -1); this._tagMap.Add("xmp", -1); this._tagMap.Add("style", -1); this._tagMap.Add("a", 1); this._tagMap.Add("table", 1); this._tagMap.Add("tr", 1); this._tagMap.Add("th", 1); this._tagMap.Add("td", 1); this._tagMap.Add("ul", 0); this._tagMap.Add("ol", 0); this._tagMap.Add("li", 0); this._tagMap.Add("p", 1); this._tagMap.Add("xml", 1); this._tagMap.Add("img", 1); this._tagMap.Add("br", 0); this._tagMap.Add("hr", 0); this._tagMap.Add("b", 0); this._tagMap.Add("strong", 0); this._tagMap.Add("i", 0); this._tagMap.Add("u", 0); this._tagMap.Add("strike", 0); this._tagMap.Add("sup", 0); this._tagMap.Add("sub", 0); this._tagMap.Add("iframe", 1); } #region IFormatter Members public string Render() { string formatMe = this.ReplaceFormMarkup(this._contents); // fix links formatMe = this.ReplaceRelativeUrlPlaceholder(formatMe); // find assets return formatMe; } public string Clean() { string formatMe = this.CleanTags(this._contents); return formatMe; } #endregion #region HTML Markup Handling private string InsertRelativeUrlPlaceholder(string input) { string formatMe = input; string searchTerm = System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"]; if (formatMe.IndexOf(searchTerm) > -1) { Regex reg = new Regex(searchTerm); MatchCollection matches = reg.Matches(formatMe); foreach (Match m in matches) { formatMe = formatMe.Replace(m.ToString(), this._urlPlaceholder); } } return formatMe; } private string ReplaceRelativeUrlPlaceholder(string input) { string formatMe = input; formatMe = formatMe.Replace(this._urlPlaceholder, System.Configuration.ConfigurationManager.AppSettings["URL_ROOT"]); return formatMe; } private string ReplaceFormMarkup(string input) { string formatMe = input; // replace forms. Looks like: <img class="formIcon" src="http://www.logicacmg.com/pSecured/admin/countries/_app/img/form_icon.gif?f=4" alt="" /> if (formatMe.IndexOf("form_icon.gif") > -1) { string formHtml = ""; string formId; Regex reg = new Regex("<img.*src=.*form_icon.gif\\?f=(\\d+).*>"); MatchCollection matches = reg.Matches(formatMe); foreach (Match m in matches) { if (m.Groups.Count > 0) { formId = m.Groups[1].ToString(); com.logicacmg.www.ProcessRequest FormService = new ProjectName.Core.com.logicacmg.www.ProcessRequest(); if (FormService.VerifyForm(formId) == "1") { string currentPostedFormData; try { if (System.Web.HttpContext.Current.Request.Form["formId"] == formId) { currentPostedFormData = System.Web.HttpContext.Current.Request.Form.ToString(); } else { currentPostedFormData = "formId=" + formId; } } catch (System.NullReferenceException) { currentPostedFormData = "formId=" + formId; } formHtml = FormService.GenerateForm(currentPostedFormData); formatMe = formatMe.Replace(m.ToString(), formHtml); } } } } return formatMe; } private string TidyHTML(string input) { Tidy.Document doc = new Tidy.Document(); //doc.OnMessage += new Tidy.IDocumentEvents_OnMessageEventHandler(TidyDiagnostics); // set some options doc.SetOptBool(TidyOptionId.TidyBodyOnly, 1); doc.SetOptBool(TidyOptionId.TidyXhtmlOut, 1); doc.SetOptBool(TidyOptionId.TidyWord2000, 1); doc.SetOptValue(TidyOptionId.TidyIndentContent, "auto"); int err_code = doc.ParseString(input); if (err_code < 0) { throw new Exception("Unable to parse string: " + input); } err_code = doc.CleanAndRepair(); if (err_code < 0) { throw new Exception("Unable to clean/repair string: " + input); } //err_code = doc.RunDiagnostics(); //if (err_code < 0) //{ // throw new Exception("Unable to run diagnostics on: " + input); //} return(doc.SaveString().Trim()); } public void TidyDiagnostics(TidyATL.TidyReportLevel level, int line, int col, string message) { Console.WriteLine("Tidy diagnostic message: " + message); } private int InStrEndOfTag (string input) { bool attr = false; int pos = 0; while (pos < input.Length) { pos++; if (!attr && (input.Substring(pos, 1) == ">")) { return pos; } if (input.Substring(pos, 1) == "") { attr = !attr; } } return pos; } private string RemoveExtraTags(string input) { string temp = input; string output = ""; int pos; string tag, name; while (temp != "") { if (temp.Substring(0, 1) == "<") { pos = InStrEndOfTag(temp); if (pos == 0) { tag = temp.Substring(2); temp = ""; } else { tag = temp.Substring(1, pos - 1); temp = temp.Substring(pos + 1); } name = tag.Split(new Char[] { ' ' })[0].ToLower(); if (name.Substring(0, 1) == "/") { name = name.Substring(1); } if (this._tagMap.Contains(name)) { int val = Convert.ToInt32(this._tagMap[name].ToString()); switch (val) { case -1: pos = temp.ToLower().IndexOf("</" + name + ">"); if (pos > 0) { temp = temp.Substring(pos + name.Length + 3); } break; case 0: output += "<"; if (tag.Substring(0, 1) == "/") { output += "/"; } output += name + ">"; break; case 1: output += "<" + tag + ">"; break; default: break; } } } else { output += temp.Substring(0,1); temp = temp.Substring(1); } } return output; } private string CleanTags(string input) { // run HTML Tidy on content string formatMe = this.TidyHTML(input.Trim()); // get rid of comments first to make tag balancing a little easier formatMe = this.StripComments(formatMe); // remove attributes that are unacceptable in any case (e.g., JavaScript attributes, CSS) formatMe = this.ReplaceNastyAttributes(formatMe); formatMe = RemoveExtraTags(formatMe); // substitute placeholder for relative links formatMe = this.InsertRelativeUrlPlaceholder(formatMe); return formatMe; } private string StripComments(string input) { Regex regex = new Regex("<!--.*-->"); return regex.Replace(input, ""); } private string ReplaceNastyAttributes(string input) { Regex regex = new Regex("( on[a-z]{1,}|style|id)=[\"'](.*?)[\"']"); return regex.Replace(input, ""); } #endregion HTML Markup Handling } }
Initial URL
Initial Description
Requires TidyATL library for .NET - http://www.devx.com/dotnet/Article/20505/1763/page/2
Initial Title
.NET HTML Formatter
Initial Tags
html, Net
Initial Language
C#