Regular Expressions in C# – Part 5 – Groups

In regular expressions we can use groups to capture the parts of our subject that are matched by sub-patterns of our pattern. These groups are also found in our Match object, so we can retrieve the matches for each group. Groups are expressed by using parentheses '()'. Groups are counted from left to right, starting at 0 with the match for the whole pattern.

Access to group match values

We test some real life examples of regular expression patterns with groups. But before we do that we need a little helper to print out the actual group matches. We do this by printing the Groups collection from the Match object.

using System.Diagnostics;
using System.Text.RegularExpressions;
 
namespace RegularExpressions.Tests.Helpers
{
    /// <summary>
    /// Debug-output helpers used by the regular expression tests.
    /// </summary>
    public class DebugWriter
    {
        /// <summary>
        /// Writes one "Group n: value" line to the debug output for every entry in
        /// the Groups collection of <paramref name="match"/>, numbering from zero.
        /// </summary>
        /// <param name="match">The match whose groups are printed.</param>
        public static void WriteGroups(Match match)
        {
            var position = 0;

            foreach (Group current in match.Groups)
            {
                Debug.WriteLine("Group {0}: {1}", position, current);
                position++;
            }
        }
    }
}

Match a Postal Code

In the Netherlands we use a postal code format of four digits and two uppercase alphabetic characters (1234 AB). Sometimes there is a space between the numeric and alphabetic characters… sometimes not, but both are valid postal codes. See the test below for a regular expression that matches both occurrences and returns the numeric and alphabetic part as separate groups.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using RegularExpressions.Tests.Helpers;
 
namespace RegularExpressions.Tests.Part05
{
    /// <summary>
    /// Shows how capture groups split a Dutch postal code into its numeric and
    /// alphabetic parts, with and without the optional space between them.
    /// </summary>
    [TestClass]
    public class Groups
    {
        /// <summary>
        /// A postal code written with a space yields the digits and the letters
        /// as groups 1 and 2.
        /// </summary>
        [TestMethod]
        public void Match_PostcalCode_With_Space_Character()
        {
            // Group 1: four digits; optional space; group 2: two uppercase letters.
            const string pattern = @"^([0-9]{4}) ?([A-Z]{2})";
            const string subject = "4841 AB";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteGroups(match);
            }

            Assert.AreEqual(1, matches.Count);

            // Verify the captured groups themselves, not only the number of matches.
            GroupCollection groups = matches[0].Groups;
            Assert.AreEqual(3, groups.Count);
            Assert.AreEqual("4841 AB", groups[0].Value);
            Assert.AreEqual("4841", groups[1].Value);
            Assert.AreEqual("AB", groups[2].Value);

            // Debug Trace:
            // Group 0: 4841 AB
            // Group 1: 4841
            // Group 2: AB
        }

        /// <summary>
        /// A postal code written without a space yields the same groups 1 and 2;
        /// only group 0 (the whole match) differs.
        /// </summary>
        [TestMethod]
        public void Match_PostcalCode_Without_Space_Character()
        {
            // Group 1: four digits; optional space; group 2: two uppercase letters.
            const string pattern = @"^([0-9]{4}) ?([A-Z]{2})";
            const string subject = "4841AB";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteGroups(match);
            }

            Assert.AreEqual(1, matches.Count);

            // Verify the captured groups themselves, not only the number of matches.
            GroupCollection groups = matches[0].Groups;
            Assert.AreEqual(3, groups.Count);
            Assert.AreEqual("4841AB", groups[0].Value);
            Assert.AreEqual("4841", groups[1].Value);
            Assert.AreEqual("AB", groups[2].Value);

            // Debug Trace:
            // Group 0: 4841AB
            // Group 1: 4841
            // Group 2: AB
        }
    }
}

The first group is the numeric character sequence of four ([0-9]{4}). The second group is a pair of uppercase alphabetic characters ([A-Z]{2}). Between these groups we have a space character followed by a question mark ' ?' to express that the space between them is optional. Three groups are returned: the whole match, the first group and the second group. Both tests return the same values for the first and second group; only the whole match differs by the optional space.

Regular Expressions in C# – Part 4 – Wild Character

We use the dot character '.' to match any character in a regular expression pattern. It is called a wildcard character. This includes spaces, but not the newline character. If we want to match only at word boundaries we use the \b anchor. If the characters between these boundaries must be alphanumeric (or underscore) [a-zA-Z0-9_] we can use the shorthand \w instead of the dot. Here are a few examples.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using RegularExpressions.Tests.Helpers;
 
namespace RegularExpressions.Tests.Part04
{
    /// <summary>
    /// Demonstrates the dot wildcard, the \b word-boundary anchor and the \w
    /// word-character shorthand.
    /// </summary>
    [TestClass]
    public class WildCharacters
    {
        /// <summary>
        /// The dot matches every character of the subject except the newline.
        /// </summary>
        [TestMethod]
        public void Each_Character_Produces_Match_Except_NewLine()
        {
            const string pattern = ".";
            const string subject = "boy\ngirl";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteMatch(match, subject);
            }

            Assert.AreEqual(7, matches.Count);

            // Debug Trace (index 3 is the newline, which is skipped):
            // 0: 1: b
            // 1: 1: o
            // 2: 1: y
            // 4: 1: g
            // 5: 1: i
            // 6: 1: r
            // 7: 1: l
        }

        /// <summary>
        /// Any three characters enclosed by word boundaries match, including a
        /// run that starts with a space.
        /// </summary>
        [TestMethod]
        public void Matches_Each_Boundery_Of_Three_Characters()
        {
            const string pattern = @"\b.{3}\b";
            const string subject = "man bear pig xx";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteMatch(match, subject);
            }

            Assert.AreEqual(3, matches.Count);

            // Debug Trace:
            // 0: 3: man
            // 9: 3: pig
            // 12: 3:  xx <- Note: space-x-x is also a match
        }

        /// <summary>
        /// Restricting the three characters to \w leaves only the real
        /// three-letter words.
        /// </summary>
        [TestMethod]
        public void Matches_Each_Word_Of_Three_Characters()
        {
            const string pattern = @"\b\w{3}\b";
            const string subject = "man bear pig xx";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteMatch(match, subject);
            }

            Assert.AreEqual(2, matches.Count);

            // Debug Trace:
            // 0: 3: man
            // 9: 3: pig
        }

        /// <summary>
        /// Words that start with 'p' are matched regardless of their length and
        /// of the line they are on.
        /// </summary>
        [TestMethod]
        public void Matches_Each_Word_Of_Any_Length_Of_Characters_Starting_With_P()
        {
            // The leading \b anchors the 'p' to the start of a word; without it
            // the pattern would also match from a 'p' in the middle of a word.
            const string pattern = @"\bp\w+";
            const string subject = "man bear pig\n pothole";
            var regEx = new Regex(pattern);
            MatchCollection matches = regEx.Matches(subject);

            foreach (Match match in matches)
            {
                DebugWriter.WriteMatch(match, subject);
            }

            Assert.AreEqual(2, matches.Count);

            // Debug Trace:
            // 9: 3: pig
            // 14: 7: pothole
        }
    }
}

Regular Expressions in C# – Part 3 – Anchors

In regular expressions we can use the circumflex character (^) to express the beginning of a line or string. If we want to express the end of a line or a string, we can use the dollar sign ($). The m modifier (?m) turns on multiline mode, in which ^ and $ also match at the start and end of each line. The DebugWriter’s WriteMatch helper method from the previous example prints the matches found in the tests below.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using RegularExpressions.Tests.Helpers;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Demonstrates the ^ and $ anchors, with and without the multiline (m) modifier.
    /// </summary>
    [TestClass]
    public class Anchors
    {
        [TestMethod]
        public void Boy_Should_Be_Found_At_The_Start_Of_The_First_Line()
        {
            const string pattern = "^boy";
            const string subject = "boygirlboy\nboy";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // Without the m modifier ^ only anchors to the start of the
            // subject, so the boy after the newline is not found.
            Assert.AreEqual(1, found.Count);

            // Debug Trace:
            // 0: 3: boy
        }

        [TestMethod]
        public void Boy_Should_Be_Found_At_The_Start_Of_Multiple_Lines()
        {
            const string pattern = "(?m)^boy";
            const string subject = "boygirlboy\nboy";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // In multiline mode boy is found at the beginning of both lines.
            Assert.AreEqual(2, found.Count);

            // Debug Trace:
            // 0: 3: boy
            // 11: 3: boy
        }

        [TestMethod]
        public void Boy_And_Girl_Should_Be_Found_At_The_Start_Of_Multiple_Lines()
        {
            const string pattern = "(?m)^boy|^girl";
            const string subject = "boygirlboy\ngirl\nboy";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // The m modifier covers both alternatives, so boy and girl are
            // found at the beginning of every line they start.
            Assert.AreEqual(3, found.Count);

            // Debug Trace:
            // 0: 3: boy
            // 11: 4: girl
            // 16: 3: boy
        }

        [TestMethod]
        public void Only_Boy_Should_Be_Found_At_The_Start_Of_Multiple_Lines()
        {
            const string pattern = "(?m:^boy)|^girl";
            const string subject = "boygirlboy\ngirl\nboy";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // The m modifier is scoped to the boy alternative only, so girl
            // on the second line is not found.
            Assert.AreEqual(2, found.Count);

            // Debug Trace:
            // 0: 3: boy
            // 16: 3: boy
        }

        [TestMethod]
        public void Boy_Should_Be_Found_At_The_End_Of_The_Subject()
        {
            const string pattern = "boy$";
            const string subject = "boygirlboy\nboy\n";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // boy is found at the end of the string; the end may still be
            // followed by a single newline character.
            Assert.AreEqual(1, found.Count);

            // Debug Trace:
            // 11: 3: boy
        }

        [TestMethod]
        public void Boy_Should_Be_Found_At_The_End_Of_Multiple_Lines()
        {
            const string pattern = "(?m)boy$";
            const string subject = "boygirlboy\nboy";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            // In multiline mode boy is found at the end of every line.
            Assert.AreEqual(2, found.Count);

            // Debug Trace:
            // 7: 3: boy
            // 11: 3: boy
        }
    }
}

Regular Expressions in C# – Part 2 – Matches and NextMatch

In the previous example on Regular Expressions we only matched the first occurrence of a pattern in a subject string. By using the NextMatch method on the Match object we can iterate through all subsequent matches in the subject. As the name implies, calling NextMatch returns the next Match object from the subject.

Printing match results

Let’s write a little helper to print out the match results. It’s a simple method printing the start position, length and contents of the match to the debug output. We can use this in our tests.

using System.Diagnostics;
using System.Text.RegularExpressions;
 
namespace RegularExpressions.Tests.Helpers
{
    /// <summary>
    /// Debug-output helpers used by the regular expression tests.
    /// </summary>
    public class DebugWriter
    {
        /// <summary>
        /// Writes the start index, the length and the matched text of
        /// <paramref name="match"/> to the debug output as "index: length: text".
        /// </summary>
        /// <param name="match">The match to print.</param>
        /// <param name="subject">The string the matched text is cut from.</param>
        internal static void WriteMatch(Match match, string subject)
        {
            int start = match.Index;
            int length = match.Length;
            string text = subject.Substring(start, length);

            Debug.WriteLine("{0}: {1}: {2}", start, length, text);
        }
    }
}

Using NextMatch

The use of the NextMatch method is pretty straightforward. Be aware that this can result in unexpected behavior if we are not careful with repetition operators like ? and *. These operators always return a successful match, even if the pattern is not found; they simply return a zero-length match.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using RegularExpressions.Tests.Helpers;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Walks through all matches in a subject with Match.NextMatch and shows how
    /// the *, ? and + operators differ in the matches they produce.
    /// </summary>
    [TestClass]
    public class NextMatch
    {
        [TestMethod]
        public void Pattern_Should_Be_Found_Five_Times_With_Star()
        {
            const string pattern = "a*";
            const string subject = "aaaabcaa";
            var regEx = new Regex(pattern);
            var hits = 0;

            // Keep asking for the next match until one is unsuccessful.
            for (Match current = regEx.Match(subject); current.Success; current = current.NextMatch())
            {
                hits++;
                DebugWriter.WriteMatch(current, subject);
            }

            Assert.AreEqual(5, hits);

            // Debug Trace (the zero-length lines are empty matches):
            // 0: 4: aaaa
            // 4: 0: 
            // 5: 0: 
            // 6: 2: aa
            // 8: 0: 
        }

        [TestMethod]
        public void Pattern_Should_Be_Found_Nine_Times_With_Questionmark()
        {
            const string pattern = "a?";
            const string subject = "aaaabcaa";
            var regEx = new Regex(pattern);
            var hits = 0;

            // Keep asking for the next match until one is unsuccessful.
            for (Match current = regEx.Match(subject); current.Success; current = current.NextMatch())
            {
                hits++;
                DebugWriter.WriteMatch(current, subject);
            }

            Assert.AreEqual(9, hits);

            // Debug Trace (the zero-length lines are empty matches):
            // 0: 1: a
            // 1: 1: a
            // 2: 1: a
            // 3: 1: a
            // 4: 0: 
            // 5: 0: 
            // 6: 1: a
            // 7: 1: a
            // 8: 0:
        }

        [TestMethod]
        public void Pattern_Should_Be_Found_Two_Times_With_Plus()
        {
            const string pattern = "a+";
            const string subject = "aaaabcaa";
            var regEx = new Regex(pattern);
            var hits = 0;

            // Keep asking for the next match until one is unsuccessful.
            for (Match current = regEx.Match(subject); current.Success; current = current.NextMatch())
            {
                hits++;
                DebugWriter.WriteMatch(current, subject);
            }

            Assert.AreEqual(2, hits);

            // Debug Trace:
            // 0: 4: aaaa
            // 6: 2: aa
        }
    }
}

Using Matches

A more convenient way of iterating through all matches is by using the Matches method on the Regex class. It returns a MatchCollection we can query with LINQ for instance.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using RegularExpressions.Tests.Helpers;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Collects all matches at once with Regex.Matches instead of stepping
    /// through them with NextMatch.
    /// </summary>
    [TestClass]
    public class Matches
    {
        [TestMethod]
        public void Pattern_Should_Be_Found_Two_Times_With_Plus()
        {
            const string pattern = "a+";
            const string subject = "aaaabcaa";
            MatchCollection found = new Regex(pattern).Matches(subject);

            for (var i = 0; i < found.Count; i++)
            {
                DebugWriter.WriteMatch(found[i], subject);
            }

            Assert.AreEqual(2, found.Count);

            // Debug Trace:
            // 0: 4: aaaa
            // 6: 2: aa
        }
    }
}

Regular Expressions in C# – Part 1 – Basics

Since I will be writing a lot of validation code in the coming weeks, I decided to dive a little deeper into creating and using Regular Expressions. These little cryptic patterns are very useful when it comes to validating business rules on data fields. It just takes a little practice to create useful regular expression patterns. First the basics…

From MSDN – Regular expressions provide a powerful, flexible, and efficient method for processing text. The extensive pattern-matching notation of regular expressions enables you to quickly parse large amounts of text to find specific character patterns; to validate text to ensure that it matches a predefined pattern (such as an e-mail address); to extract, edit, replace, or delete text substrings; and to add the extracted strings to a collection in order to generate a report.

Using the Regex class in .NET

In the System.Text.RegularExpressions namespace we find the Regex class. Just new it up with some regular expression pattern string. Once instantiated you can’t change this pattern. Call the Match method and pass it whatever subject you want to match the pattern with. It returns a Match object. The Success property of this object tells us if we have a match.

// Create the expression for the pattern we want to look for.
Regex regEx = new Regex(pattern);

// Match hands back a Match object; its Success property tells us the outcome.
Match match = regEx.Match(subject);

if (match.Success)
{
    string result = "We have a match";
}

Concatenation

Matching on a sequence of characters is fairly simple. Finding a concatenation (like ‘cat’) in a subject string is done like in the test code below. We just check to see the starting index of the match and the length of the match:

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Matches a plain sequence of characters and checks where in the subject
    /// it was found.
    /// </summary>
    [TestClass]
    public class Concatenation
    {
        /// <summary>
        /// 'cat' is found after 'dog', at index 3.
        /// </summary>
        [TestMethod]
        public void Cat_Should_Be_Found()
        {
            const string pattern = "cat";
            const string subject = "dogcat";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // Separate assertions report which value was wrong and what it was.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(3, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        /// <summary>
        /// With two occurrences of 'cat', Match returns the first one.
        /// </summary>
        [TestMethod]
        public void Cat_Should_Be_Found_First_Occurence()
        {
            const string pattern = "cat";
            const string subject = "catdogcat";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        /// <summary>
        /// A pattern that does not occur in the subject gives an unsuccessful match.
        /// </summary>
        [TestMethod]
        public void Bird_Should_Not_Be_Found()
        {
            const string pattern = "bird";
            const string subject = "dogcat";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            Assert.IsFalse(match.Success);
        }
    }
}

Alternation

So what if we want to find a cat or a dog in a subject? We use the ‘|’ alternation sign. Going from left to right, the first occurrence it finds wins the match.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Shows how the '|' alternation operator picks a match: the earliest
    /// position in the subject wins, and at that position the alternatives are
    /// tried from left to right.
    /// </summary>
    [TestClass]
    public class Alternation
    {
        /// <summary>
        /// 'dog' occurs first in the subject, so it wins even though 'cat' is
        /// listed first in the pattern.
        /// </summary>
        [TestMethod]
        public void Dog_Should_Be_Found()
        {
            const string pattern = "cat|dog";
            const string subject = "dogcat";
            var regEx = new Regex(pattern);
            Match match = regEx.Match(subject);

            // Separate assertions report which value was wrong and what it was.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        /// <summary>
        /// The longer alternative 'dogcat' is listed first and matches at index 0.
        /// </summary>
        [TestMethod]
        public void Dogcat_Should_Be_Found()
        {
            const string pattern = "dogcat|cat";
            const string subject = "dogcat";
            var regEx = new Regex(pattern);
            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(6, match.Length);
        }

        /// <summary>
        /// 'dog' is listed before 'dogcat', so the shorter alternative wins at index 0.
        /// </summary>
        [TestMethod]
        public void Dog_Should_Be_Found_First()
        {
            const string pattern = "cat|dog|dogcat";
            const string subject = "dogcat";
            var regEx = new Regex(pattern);
            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        /// <summary>
        /// Only two 'cat's follow each other in the subject, so 'catcatcat'
        /// fails and 'catcat' is the first alternative that matches.
        /// </summary>
        [TestMethod]
        public void Catcat_Should_Be_Found()
        {
            const string pattern = "catcatcat|catcat|cat";
            const string subject = "dogcatcatdogdog";
            var regEx = new Regex(pattern);
            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(3, match.Index);
            Assert.AreEqual(6, match.Length);
        }
    }
}

Repetition

If we want to find the occurrence of some sequence repeated a number of times, we can use the ‘*’ or ‘+’ operators. The ‘*’ operator is always successful, even if it doesn’t find a match. In that case it simply returns a zero length match. The ‘+’ is a more picky repetition finder and will fail if it doesn’t match. We can search for a range by using the {0,2} notation. This will find the repetition zero times, once or twice (largest first). The ‘?’ is a shorthand for {0,1}, which is zero or one occurrence.

using System.Text.RegularExpressions;
using Microsoft.VisualStudio.TestTools.UnitTesting;
 
namespace RegularExpressions.Tests
{
    /// <summary>
    /// Demonstrates the repetition operators *, +, ? and the {n} / {n,m} ranges,
    /// checking the index and length of the first match in each case.
    /// </summary>
    [TestClass]
    public class Repetition
    {
        [TestMethod]
        public void Character_A_Should_Be_Found_Once()
        {
            const string pattern = "a*";
            const string subject = "abc";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // Separate assertions report which value was wrong and what it was.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(1, match.Length);
        }

        [TestMethod]
        public void Character_A_Should_Be_Found_Repeated_Four_Times()
        {
            const string pattern = "a*";
            const string subject = "aaaabcaa";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(4, match.Length);
        }

        [TestMethod]
        public void Character_A_Should_Be_Found_Length_Zero()
        {
            const string pattern = "a*";
            const string subject = "bcdefgh";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // No 'a' at the start: * still succeeds, with an empty match.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(0, match.Length);
        }

        [TestMethod]
        public void Aa_Should_Be_Found_Repeated_Twice()
        {
            const string pattern = "(aa)*";
            const string subject = "aaaaa";
            var regEx = new Regex(pattern);
            Match match = regEx.Match(subject);

            // Two whole 'aa' pairs fit; the fifth 'a' is left over.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(4, match.Length);
        }

        [TestMethod]
        public void Dog_Should_Be_Found_Length_Zero()
        {
            // Repetition matching with * is always successful.
            // It simply returns 0 as length (empty match).

            const string pattern = "(cat)*";
            const string subject = "dogcatcat";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(0, match.Length);
        }

        [TestMethod]
        public void Catcat_Should_Be_Found_Range_Zero_To_Two()
        {
            const string pattern = "(cat){0,2}";
            const string subject = "catcatcatcatdog";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // The range takes as many repetitions as it may: two.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(6, match.Length);
        }

        [TestMethod]
        public void Cat_Should_Be_Found_Exactly_Ones()
        {
            const string pattern = "(cat){1}";
            const string subject = "catcatcatcatdog";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // Check that exactly one 'cat' was taken, not merely that
            // something matched.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        [TestMethod]
        public void Cat_Should_Be_Found_Once()
        {
            // ? is shortcut for zero or once
            const string pattern = "(cat)?";
            const string subject = "catcatcatcatdog";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            // ? always succeeds (possibly with an empty match), so Success
            // alone proves nothing; the length shows 'cat' was found once.
            Assert.IsTrue(match.Success);
            Assert.AreEqual(0, match.Index);
            Assert.AreEqual(3, match.Length);
        }

        [TestMethod]
        public void Catcat_Should_Be_Found_Twice_With_Plus()
        {
            // The + operator needs at least one repetition, so it does not
            // return a zero length match at 'dog'; it matches both 'catcat'
            // repetitions that follow.

            const string pattern = "(catcat)+";
            const string subject = "dogcatcatcatcatdog";
            var regEx = new Regex(pattern);

            Match match = regEx.Match(subject);

            Assert.IsTrue(match.Success);
            Assert.AreEqual(3, match.Index);
            Assert.AreEqual(12, match.Length);
        }
    }
}

Pretty basic stuff huh? Next time we dive in deeper…