My client, a super-huge pharmaceutical company, gave me a particularly good challenge: take a string of HTML and truncate it to a maximum number of text characters, keeping only whole words and all of the markup. The trick, of course, is to close any open tags, to avoid cutting the text off in the middle of a tag, and to avoid leaving a partial word at the end. I whipped out my trusty pen and paper and started drawing out the algorithm. It turned out to be pretty straightforward, but there’s a lot to watch out for.
The Basic Idea
- Walk the string, counting both the raw characters and the text characters (those outside of tags), until you hit the maximum number of text characters
- Truncate the string to the character count, keeping whole words
- Create a Stack to hold all open tags
- Use some nifty regex to identify HTML tags
- Loop through and push all tags to the Stack and pop them off as we find closing tags
- Try to recover if we have malformed HTML and find tags closed in the wrong order
- Pop any remaining tags off the stack and add them as closing tags to our truncated string
You can download the source here, along with some other nice HTML String Extensions like Truncate() and TruncateWords(). I’ve included some unit tests as well.
Source Code (with a few bonus extension methods)
- Source hosted at my Helpers.Net GitHub Project
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace System
{
    /// <summary>
    /// String extension methods for truncating plain text and HTML and for stripping HTML tags.
    /// </summary>
    public static class StringHtmlExtensions
    {
        // Matches one tag and captures the element name of an opening tag ("tag"),
        // the element name of a closing tag ("closeTag") and the slash of a
        // self-closing tag ("selfClose"). [^>]* (rather than .*) lets a tag span lines.
        private static readonly Regex HtmlTagRegex = new Regex(
            @"<((?<tag>[^\s/>]+)|/(?<closeTag>[^\s>]+))[^>]*?(?<selfClose>/)?\s*>",
            RegexOptions.Compiled);

        // Matches anything from '<' up to the next '>' (newlines included).
        private static readonly Regex AnyTagRegex = new Regex(@"<[^>]*>", RegexOptions.Compiled);

        /// <summary>
        /// Truncates a string containing HTML to a number of text characters, keeping whole words.
        /// Characters inside tags are not counted. The result keeps the markup and any tags
        /// left open are closed.
        /// </summary>
        /// <remarks>
        /// Tags are balanced even when nothing is cut off. An element written without a
        /// trailing slash (for example &lt;br&gt;) is treated like any other open tag and
        /// receives a closing tag. If the kept text contains no whitespace at all, the
        /// partial word is kept rather than returning no text.
        /// </remarks>
        /// <param name="html">The HTML to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of text characters to keep. Zero or a negative value disables
        /// truncation, as it does for <see cref="Truncate(string, int)"/>.
        /// </param>
        /// <param name="trailingText">
        /// Text, e.g. an ellipsis, appended before the closing tags when text was cut off. May be null.
        /// </param>
        /// <returns>The truncated HTML with all open tags closed.</returns>
        public static string TruncateHtml(this string html, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            var cut = maxCharacters > 0 ? FindHtmlCutIndex(html, maxCharacters) : html.Length;
            var truncated = cut < html.Length;
            var kept = truncated ? html.Substring(0, cut) : html;

            // keep track of open tags so that any tag left open can be closed
            var openTags = new Stack<string>();
            foreach (Match match in HtmlTagRegex.Matches(kept))
            {
                var tag = match.Groups["tag"].Value;
                var closeTag = match.Groups["closeTag"].Value;

                if (!string.IsNullOrEmpty(tag))
                {
                    // comments, doctypes and processing instructions are never closed,
                    // and neither are self-closing tags such as <br />
                    var isDeclaration = tag[0] == '!' || tag[0] == '?';
                    if (!isDeclaration && string.IsNullOrEmpty(match.Groups["selfClose"].Value))
                    {
                        openTags.Push(tag);
                    }
                }
                else if (!string.IsNullOrEmpty(closeTag))
                {
                    CloseOpenTag(openTags, closeTag);
                }
            }

            var result = new StringBuilder(kept);

            if (truncated)
            {
                result.Append(trailingText);
            }

            // pop the rest off the stack to close the remaining tags, innermost first
            while (openTags.Count > 0)
            {
                result.Append("</");
                result.Append(openTags.Pop());
                result.Append('>');
            }

            return result.ToString();
        }

        /// <summary>
        /// Truncates a string containing HTML to a number of text characters, keeping whole words.
        /// The result keeps the markup and any tags left open are closed.
        /// </summary>
        /// <param name="html">The HTML to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">Maximum number of text characters to keep.</param>
        /// <returns>The truncated HTML with all open tags closed.</returns>
        public static string TruncateHtml(this string html, int maxCharacters)
        {
            return html.TruncateHtml(maxCharacters, null);
        }

        /// <summary>
        /// Strips all HTML tags from a string.
        /// </summary>
        /// <param name="html">The HTML to strip. Null or empty is returned unchanged.</param>
        /// <returns>The input with everything between '&lt;' and the next '&gt;' removed.</returns>
        public static string StripHtml(this string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            return AnyTagRegex.Replace(html, string.Empty);
        }

        /// <summary>
        /// Truncates text to a number of characters.
        /// </summary>
        /// <param name="text">The text to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of characters to keep. Zero or a negative value returns the text unchanged.
        /// </param>
        /// <returns>The first <paramref name="maxCharacters"/> characters of the text.</returns>
        public static string Truncate(this string text, int maxCharacters)
        {
            return text.Truncate(maxCharacters, null);
        }

        /// <summary>
        /// Truncates text to a number of characters and adds trailing text, e.g. an ellipsis, to the end.
        /// </summary>
        /// <param name="text">The text to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of characters to keep. Zero or a negative value returns the text unchanged.
        /// </param>
        /// <param name="trailingText">Text appended only when something was cut off. May be null.</param>
        /// <returns>The truncated text followed by the trailing text, or the text itself if it already fits.</returns>
        public static string Truncate(this string text, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(text) || maxCharacters <= 0 || text.Length <= maxCharacters)
            {
                return text;
            }

            return text.Substring(0, maxCharacters) + trailingText;
        }

        /// <summary>
        /// Truncates text and discards any partial word left at the end.
        /// </summary>
        /// <param name="text">The text to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of characters to keep. Zero or a negative value returns the text unchanged.
        /// </param>
        /// <returns>The truncated text, ending on a whole word where possible.</returns>
        public static string TruncateWords(this string text, int maxCharacters)
        {
            return text.TruncateWords(maxCharacters, null);
        }

        /// <summary>
        /// Truncates text, discards any partial word left at the end and adds trailing text.
        /// </summary>
        /// <remarks>
        /// This method knows nothing about markup; use <see cref="TruncateHtml(string, int, string)"/>
        /// for HTML. If the kept text contains no whitespace, the partial word is kept.
        /// </remarks>
        /// <param name="text">The text to truncate. Null or empty is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of characters to keep. Zero or a negative value returns the text unchanged.
        /// </param>
        /// <param name="trailingText">Text appended only when something was cut off. May be null.</param>
        /// <returns>The truncated text followed by the trailing text, or the text itself if it already fits.</returns>
        public static string TruncateWords(this string text, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(text) || maxCharacters <= 0 || text.Length <= maxCharacters)
            {
                return text;
            }

            var cut = maxCharacters;

            // A word is only split when the first dropped character is not whitespace;
            // in that case fall back to the last whitespace before the cut (if any).
            if (!char.IsWhiteSpace(text[cut]))
            {
                var lastSpace = -1;
                for (var i = cut - 1; i >= 0; i--)
                {
                    if (char.IsWhiteSpace(text[i]))
                    {
                        lastSpace = i;
                        break;
                    }
                }

                if (lastSpace >= 0)
                {
                    cut = lastSpace;
                }
            }

            // never leave whitespace dangling in front of the trailing text
            while (cut > 0 && char.IsWhiteSpace(text[cut - 1]))
            {
                cut--;
            }

            return text.Substring(0, cut) + trailingText;
        }

        /// <summary>
        /// Finds the index at which <paramref name="html"/> has to be cut so that at most
        /// <paramref name="maxCharacters"/> text characters remain, the cut is outside of any
        /// tag and no partial word is left behind.
        /// </summary>
        /// <returns>The exclusive end index of the part to keep; html.Length when all text fits.</returns>
        private static int FindHtmlCutIndex(string html, int maxCharacters)
        {
            var textCount = 0;
            var insideTag = false;
            var lastSpace = -1; // index of the last whitespace character that is text, not markup
            var limit = -1;     // index just past the last text character that may be kept

            // count the text characters and ignore everything between '<' and '>'
            for (var i = 0; i < html.Length; i++)
            {
                var c = html[i];
                if (c == '<')
                {
                    insideTag = true;
                }
                else if (!insideTag)
                {
                    if (char.IsWhiteSpace(c))
                    {
                        lastSpace = i;
                    }

                    textCount++;
                    if (textCount >= maxCharacters)
                    {
                        limit = i + 1;
                        break;
                    }
                }

                if (c == '>')
                {
                    insideTag = false;
                }
            }

            if (limit < 0)
            {
                return html.Length; // the limit was never reached
            }

            // look past any tags for the first text character that would be dropped
            var next = -1;
            insideTag = false;
            for (var i = limit; i < html.Length; i++)
            {
                var c = html[i];
                if (c == '<')
                {
                    insideTag = true;
                }
                else if (!insideTag)
                {
                    next = i;
                    break;
                }
                else if (c == '>')
                {
                    insideTag = false;
                }
            }

            if (next < 0)
            {
                return html.Length; // only markup follows, so all of the text fits
            }

            var cut = limit;

            // the last kept word is partial when the next text character continues it;
            // go back to the last text whitespace, which is never inside a tag
            if (!char.IsWhiteSpace(html[next]) && lastSpace >= 0)
            {
                cut = lastSpace;
            }

            while (cut > 0 && char.IsWhiteSpace(html[cut - 1]))
            {
                cut--;
            }

            return cut;
        }

        /// <summary>
        /// Handles a closing tag: pops open tags up to and including the matching opener,
        /// which discards tags that were left unclosed inside it. A closing tag that has
        /// no opener on the stack is ignored.
        /// </summary>
        private static void CloseOpenTag(Stack<string> openTags, string closeTag)
        {
            var hasOpener = false;
            foreach (var open in openTags)
            {
                // HTML element names are case-insensitive
                if (string.Equals(open, closeTag, StringComparison.OrdinalIgnoreCase))
                {
                    hasOpener = true;
                    break;
                }
            }

            if (!hasOpener)
            {
                return;
            }

            while (!string.Equals(openTags.Pop(), closeTag, StringComparison.OrdinalIgnoreCase))
            {
                // keep popping: everything above the opener was left unclosed
            }
        }
    }
}
Unit Tests
using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Helpers.Net.Tests
{
    /// <summary>
    /// Unit tests for the string and HTML string extension methods.
    /// </summary>
    [TestClass]
    public class StringHtmlExtensionTests
    {
        /// <summary>
        /// Gets or sets the test context which provides
        /// information about and functionality for the current test run.
        /// </summary>
        public TestContext TestContext { get; set; }

        [TestMethod]
        public void ToDelimitedStringTest()
        {
            Assert.AreEqual("", (new string[] { }).ToDelimitedString(","));
            Assert.AreEqual("", ((string[])null).ToDelimitedString(","));
            Assert.AreEqual("one", (new string[] { "one" }).ToDelimitedString(", "));
            Assert.AreEqual("one, two", (new string[] { "one", "two" }).ToDelimitedString(", "));
            Assert.AreEqual("one,two", (new string[] { "one", "two" }).ToDelimitedString(","));
        }

        [TestMethod]
        public void StripHtmlTest()
        {
            Assert.IsNull(((string)null).StripHtml());
            Assert.AreEqual("hello", "hello".StripHtml());
            Assert.AreEqual("hello", "he<b>ll</b>o".StripHtml());
        }

        [TestMethod]
        public void TruncateTextTest()
        {
            // null in, null out (this used to exercise StripHtml by mistake)
            Assert.IsNull(((string)null).Truncate(5));

            string test = "1234567890";
            Assert.AreEqual("12345", test.Truncate(5, null));
            Assert.AreEqual("12345...", test.Truncate(5, "..."));
            Assert.AreEqual(string.Empty, string.Empty.Truncate(5, null));
            Assert.AreEqual("12", "12".Truncate(5));
        }

        [TestMethod]
        public void TruncateHtmlEmptyTest()
        {
            Assert.IsNull(((string)null).TruncateHtml(100));
            Assert.AreEqual(string.Empty, string.Empty.TruncateHtml(100));
        }

        [TestMethod]
        public void TruncateHtmlTextTest()
        {
            // no html test
            Assert.AreEqual("abc", "abc".TruncateHtml(10));
            Assert.AreEqual("ab", "abc".TruncateHtml(2));
        }

        [TestMethod]
        public void TruncateHtmlTest()
        {
            var html = @"<p>aaa <b>bbb</b> ccc<br> ddd</p>";
            Assert.AreEqual(@"<p>aaa <b>bbb</b> ccc<br> ddd</p>", html.TruncateHtml(100, "no trailing text"));

            // a <br> written without a slash counts as an open tag and gets closed
            Assert.AreEqual(@"<p>aaa <b>bbb</b> ccc<br>...</br></p>", html.TruncateHtml(14, "..."));
            Assert.AreEqual(@"<p>aaa <b>bbb</b></p>", html.TruncateHtml(10));

            // self closing test
            html = @"<p>hello<br/>there</p>";
            Assert.AreEqual(@"<p>hello<br/>th</p>", html.TruncateHtml(7));

            Assert.AreEqual("<b>i'm</b>", "<b>i'm awesome</b>".TruncateHtml(8));
            Assert.AreEqual("<b>i'm...</b>", "<b>i'm awesome</b>".TruncateHtml(8, "..."));
        }

        [TestMethod]
        public void TruncateWordsTest()
        {
            Assert.IsNull(((string)null).TruncateWords(100));
            Assert.AreEqual(string.Empty, string.Empty.TruncateWords(100));
            Assert.AreEqual("big brown", "big brown beaver".TruncateWords(12));
            Assert.AreEqual("big...", "big brown beaver".TruncateWords(5, "..."));
        }

        [TestMethod]
        public void TruncateWordsBreakingHtmlTagTest()
        {
            // TruncateWords is not markup-aware: the cut falls in the middle of
            // the closing tag and the tag is dropped with the partial word
            Assert.AreEqual("<b>i'm", "<b>i'm awesome</b>".TruncateWords(16));
        }
    }
}
The post Truncate HTML String C# Extension Method appeared first on RobVolk.com.