Quantcast
Viewing latest article 5
Browse Latest Browse All 5

Truncate HTML String C# Extension Method

My client, a super-huge pharmaceutical company, gave me a particularly good challenge: take a string of HTML and truncate it to a maximum number of text characters, keeping only whole words and preserving all markup.  The trick, of course, is to close any open tags, not to cut the text off in the middle of a tag, and not to leave a partial word at the end.  I whipped out my trusty pen and paper and started drawing out the algorithm.  It turned out to be pretty straightforward, but there’s a lot to watch out for.

The Basic Idea

  1. Walk the string, counting both raw characters and text characters (those outside tags), until you hit the maximum number of text characters
  2. Truncate the string to the character count, keeping whole words
  3. Create a Stack to hold all open tags
  4. Use some nifty regex to identify HTML tags
  5. Loop through and push all tags to the Stack and pop them off as we find closing tags
    1. Try to recover if we have malformed HTML and find tags closed in the wrong order
  6. Pop any remaining tags off the stack and add them as closing tags to our truncated string

You can download the source here, along with some other nice HTML String Extensions like Truncate() and TruncateWords().  I’ve included some unit tests as well.

Source Code (with a few bonus extension methods)

using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace System
{
    /// <summary>
    /// String extension methods for stripping HTML tags and for truncating plain text
    /// or HTML fragments.
    /// </summary>
    public static class StringHtmlExtensions
    {
        // Recognises one HTML tag: "tag" captures the name of an opening tag, "closeTag" the
        // name of a closing tag and "selfClose" the slash of a self-closing tag such as <br />.
        // Built once and reused; compiling a regex on every call is expensive.
        private static readonly Regex TagPattern = new Regex(
            @"<((?<tag>[^\s/>]+)|/(?<closeTag>[^\s>]+)).*?(?<selfClose>/)?\s*>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Multiline);

        // Everything from a '<' up to the next '>', line breaks included.
        private static readonly Regex AnyTagPattern = new Regex(@"<[^>]*>", RegexOptions.Compiled);

        // The last run of white space together with the non-white-space characters that end the text.
        private static readonly Regex TrailingWordPattern = new Regex(@"\s+[^\s]+$", RegexOptions.Compiled);

        /// <summary>
        /// Truncates a string containing HTML to a number of text characters, keeping whole words.
        /// The result contains HTML and any tags left open are closed.
        /// </summary>
        /// <param name="html">The HTML fragment to truncate. Null or empty input is returned unchanged.</param>
        /// <param name="maxCharacters">
        /// Maximum number of text characters (characters outside of tags) to keep.
        /// A value of zero or less means "no limit", as it does for Truncate and TruncateWords.
        /// </param>
        /// <param name="trailingText">Text, i.e. an ellipsis, appended when something was cut off. May be null.</param>
        /// <returns>The truncated HTML with the trailing text and the missing closing tags appended.</returns>
        public static string TruncateHtml(this string html, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(html) || maxCharacters <= 0)
                return html;

            var cutIndex = FindCutIndex(html, maxCharacters);
            var wasTruncated = html.Length > cutIndex;

            // Truncate the html and keep whole words only
            var trunc = new StringBuilder(
                wasTruncated ? RemoveTrailingPartialWord(html.Substring(0, cutIndex)) : html);

            // work out which tags are still open before anything is appended
            var openTags = FindOpenTags(trunc.ToString());

            if (wasTruncated)
                trunc.Append(trailingText);

            // pop the rest off the stack to close remainder of tags, innermost first
            while (openTags.Count > 0)
            {
                trunc.Append("</");
                trunc.Append(openTags.Pop());
                trunc.Append('>');
            }

            return trunc.ToString();
        }

        /// <summary>
        /// Truncates a string containing HTML to a number of text characters, keeping whole words.
        /// The result contains HTML and any tags left open are closed. No trailing text is added.
        /// </summary>
        /// <param name="html">The HTML fragment to truncate.</param>
        /// <param name="maxCharacters">Maximum number of text characters (characters outside of tags) to keep.</param>
        /// <returns>The truncated HTML with the missing closing tags appended.</returns>
        public static string TruncateHtml(this string html, int maxCharacters)
        {
            return html.TruncateHtml(maxCharacters, null);
        }

        /// <summary>
        /// Strips all HTML tags from a string
        /// </summary>
        /// <param name="html">The text to strip. Null or empty input is returned unchanged.</param>
        /// <returns>The input with everything between a '&lt;' and the next '&gt;' removed.</returns>
        public static string StripHtml(this string html)
        {
            if (string.IsNullOrEmpty(html))
                return html;

            return AnyTagPattern.Replace(html, string.Empty);
        }

        /// <summary>
        /// Truncates text to a number of characters
        /// </summary>
        /// <param name="text">The text to truncate.</param>
        /// <param name="maxCharacters">Maximum number of characters to keep; zero or less means no limit.</param>
        /// <returns>The text cut down to at most <paramref name="maxCharacters"/> characters.</returns>
        public static string Truncate(this string text, int maxCharacters)
        {
            return text.Truncate(maxCharacters, null);
        }

        /// <summary>
        /// Truncates text to a number of characters and adds trailing text, i.e. ellipses, to the end
        /// </summary>
        /// <param name="text">The text to truncate. Null or empty input is returned unchanged.</param>
        /// <param name="maxCharacters">Maximum number of characters to keep; zero or less means no limit.</param>
        /// <param name="trailingText">Text appended only when something was cut off. May be null.</param>
        /// <returns>The original text when it fits, otherwise its first characters followed by the trailing text.</returns>
        public static string Truncate(this string text, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(text) || maxCharacters <= 0 || text.Length <= maxCharacters)
                return text;

            return text.Substring(0, maxCharacters) + trailingText;
        }

        /// <summary>
        /// Truncates text and discards any partial words left at the end
        /// </summary>
        /// <param name="text">The text to truncate.</param>
        /// <param name="maxCharacters">Maximum number of characters to keep; zero or less means no limit.</param>
        /// <returns>The truncated text without its last, possibly partial, word.</returns>
        public static string TruncateWords(this string text, int maxCharacters)
        {
            return text.TruncateWords(maxCharacters, null);
        }

        /// <summary>
        /// Truncates text, discards any partial words left at the end and adds trailing text
        /// </summary>
        /// <param name="text">The text to truncate. Null or empty input is returned unchanged.</param>
        /// <param name="maxCharacters">Maximum number of characters to keep; zero or less means no limit.</param>
        /// <param name="trailingText">Text appended only when something was cut off. May be null.</param>
        /// <returns>
        /// The original text when it fits; otherwise the first characters of the text, minus the last
        /// white-space-separated word, followed by the trailing text. This method knows nothing about
        /// markup, so it may cut through an HTML tag; use TruncateHtml for HTML.
        /// </returns>
        public static string TruncateWords(this string text, int maxCharacters, string trailingText)
        {
            if (string.IsNullOrEmpty(text) || maxCharacters <= 0 || text.Length <= maxCharacters)
                return text;

            // truncate the text, then remove the partial word at the end
            return TrailingWordPattern.Replace(text.Substring(0, maxCharacters), string.Empty) + trailingText;
        }

        // Returns how many raw characters of html must be kept to obtain maxCharacters text
        // characters; characters from a '<' up to and including the next '>' are not counted as text.
        private static int FindCutIndex(string html, int maxCharacters)
        {
            var textCount = 0;
            var charCount = 0;
            var insideTag = false;
            foreach (char c in html)
            {
                charCount++;
                if (c == '<')
                    insideTag = true;
                else if (!insideTag)
                    textCount++;

                if (c == '>')
                    insideTag = false;

                // stop once we hit the limit
                if (textCount >= maxCharacters)
                    break;
            }

            return charCount;
        }

        // Drops the last word of a truncated fragment, cutting at the last run of white space
        // that lies outside of any tag. White space inside a tag (between attributes) is never
        // used as a cut point, otherwise the tag itself would be cut in half.
        private static string RemoveTrailingPartialWord(string fragment)
        {
            // a fragment that ends in white space has no partial word at its end
            if (fragment.Length == 0 || char.IsWhiteSpace(fragment[fragment.Length - 1]))
                return fragment;

            var insideTag = false;
            var previousWasSpace = false;
            var cut = -1; // start of the last white-space run found in the text
            for (var i = 0; i < fragment.Length; i++)
            {
                var c = fragment[i];
                var isTextSpace = false;

                if (c == '<')
                    insideTag = true;
                else if (c == '>')
                    insideTag = false;
                else if (!insideTag && char.IsWhiteSpace(c))
                    isTextSpace = true;

                if (isTextSpace && !previousWasSpace)
                    cut = i;
                previousWasSpace = isTextSpace;
            }

            return cut < 0 ? fragment : fragment.Substring(0, cut);
        }

        // Collects the names of the tags that are opened but not closed in the fragment,
        // innermost tag on top of the stack.
        private static Stack<string> FindOpenTags(string fragment)
        {
            var openTags = new Stack<string>();
            foreach (Match match in TagPattern.Matches(fragment))
            {
                var tag = match.Groups["tag"].Value;
                var closeTag = match.Groups["closeTag"].Value;

                if (!string.IsNullOrEmpty(tag))
                {
                    // comments, doctypes and processing instructions never take a closing tag,
                    // and self-closing tags, i.e. <br />, are closed already
                    var needsClosing = tag[0] != '!' && tag[0] != '?'
                        && string.IsNullOrEmpty(match.Groups["selfClose"].Value);
                    if (needsClosing)
                        openTags.Push(tag);
                }
                else if (!string.IsNullOrEmpty(closeTag) && IsOpen(openTags, closeTag))
                {
                    // pop up to and including the matching opening tag; tags opened inside it
                    // that were never closed (malformed HTML) are dropped on the way.
                    // IsOpen guarantees a match, so the stack cannot run empty here.
                    while (!string.Equals(openTags.Pop(), closeTag, StringComparison.OrdinalIgnoreCase))
                    { }
                }
                // a closing tag without a matching opening tag is ignored
            }

            return openTags;
        }

        // Tells whether a tag with the given name, ignoring case, is currently open.
        private static bool IsOpen(Stack<string> openTags, string name)
        {
            foreach (var open in openTags)
            {
                if (string.Equals(open, name, StringComparison.OrdinalIgnoreCase))
                    return true;
            }

            return false;
        }
    }
}

Unit Tests

using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Helpers.Net.Tests
{
    /// <summary>
    /// Unit tests for the string and HTML truncation extension methods
    /// (StripHtml, Truncate, TruncateWords, TruncateHtml) and ToDelimitedString.
    /// </summary>
    [TestClass]
    public class StringHtmlExtensionTests
    {
        /// <summary>
        ///Gets or sets the test context which provides
        ///information about and functionality for the current test run.
        ///</summary>
        public TestContext TestContext { get; set; }

        [TestMethod]
        public void ToDelimitedStringTest()
        {
            Assert.AreEqual("", (new string[] { }).ToDelimitedString(","));
            Assert.AreEqual("", ((string[])null).ToDelimitedString(","));
            Assert.AreEqual("one", (new string[] { "one" }).ToDelimitedString(", "));
            Assert.AreEqual("one, two", (new string[] { "one", "two" }).ToDelimitedString(", "));
            Assert.AreEqual("one,two", (new string[] { "one", "two" }).ToDelimitedString(","));
        }

        [TestMethod]
        public void StripHtmlTest()
        {
            Assert.IsNull(((string)null).StripHtml());

            Assert.AreEqual("hello", "hello".StripHtml());

            Assert.AreEqual("hello", "he<b>ll</b>o".StripHtml());
        }

        [TestMethod]
        public void TruncateTextTest()
        {
            // null goes through Truncate itself, not through StripHtml
            Assert.IsNull(((string)null).Truncate(5));

            string test = "1234567890";
            Assert.AreEqual("12345", test.Truncate(5, null));
            Assert.AreEqual("12345...", test.Truncate(5, "..."));
            Assert.AreEqual(string.Empty, string.Empty.Truncate(5, null));
            Assert.AreEqual("12", "12".Truncate(5));
        }

        [TestMethod]
        public void TruncateHtmlEmptyTest()
        {
            Assert.IsNull(((string)null).TruncateHtml(100));
            Assert.AreEqual(string.Empty, string.Empty.TruncateHtml(100));
        }

        [TestMethod]
        public void TruncateHtmlTextTest()
        {
            // no html test
            Assert.AreEqual("abc", "abc".TruncateHtml(10));
            Assert.AreEqual("ab", "abc".TruncateHtml(2));
        }

        [TestMethod]
        public void TruncateHtmlTest()
        {
            var html = @"<p>aaa <b>bbb</b>
ccc<br> ddd</p>";

            // short enough: returned untouched, no trailing text, the unclosed <br> is ignored
            Assert.AreEqual(@"<p>aaa <b>bbb</b>
ccc<br> ddd</p>", html.TruncateHtml(100, "no trailing text"));

            // the partial word after <br> is dropped; <br> is still open and gets closed
            Assert.AreEqual(@"<p>aaa <b>bbb</b>
ccc<br>...</br></p>", html.TruncateHtml(14, "..."));

            Assert.AreEqual(@"<p>aaa <b>bbb</b></p>", html.TruncateHtml(10));

            // self closing test
            html = @"<p>hello<br/>there</p>";
            Assert.AreEqual(@"<p>hello<br/>th</p>", html.TruncateHtml(7));

            Assert.AreEqual("<b>i'm</b>", "<b>i'm awesome</b>".TruncateHtml(8));
            Assert.AreEqual("<b>i'm...</b>", "<b>i'm awesome</b>".TruncateHtml(8, "..."));
        }

        [TestMethod]
        public void TruncateWordsTest()
        {
            Assert.IsNull(((string)null).TruncateWords(100));
            Assert.AreEqual(string.Empty, string.Empty.TruncateWords(100));

            Assert.AreEqual("big brown", "big brown beaver".TruncateWords(12));
            Assert.AreEqual("big...", "big brown beaver".TruncateWords(5, "..."));
        }

        [TestMethod]
        public void TruncateWordsBreakingHtmlTagTest()
        {
            // TruncateWords knows nothing about markup: the cut lands in the middle of
            // the closing tag and everything after the last white space is discarded
            Assert.AreEqual("<b>i'm", "<b>i'm awesome</b>".TruncateWords(16));
        }
    }
}

The post Truncate HTML String C# Extension Method appeared first on RobVolk.com.


Viewing latest article 5
Browse Latest Browse All 5

Trending Articles