Sunday, November 14, 2010

How to decode Encoded-Word header values in emails

Here is C# code to decode Encoded-Word header values in emails.
Features:

  • supports decoding of multi-line (folded) values
  • supports both Q- and base64 encoding
  • passes all samples in RFC 2047, section "8. Examples"
public class EncodedWordEncoding
    {
        public static string Decode(string encodedText)
        {
            if (encodedText == null)
                return null;

            var regex = new Regex(@"=\?(?<charset>.*?)\?(?<encoding>[qQbB])\?(?<value>.*?)\?=");
            var encodedString = encodedText;
            var decodedString = string.Empty;
            var encodedWordBefore = false;

            while (encodedString.Length > 0)
            {
                var match = regex.Match(encodedString);
                if (match.Success)
                {

                    // If the match isn't at the start of the string, copy the initial few chars to the output
                    var beforeMatch = encodedString.Substring(0, match.Index);

                    // Filter out space chars between encoded words
                    if (encodedWordBefore)
                    {
                        var regex2 = new Regex(@"(\r?\n|\r)*[ \t]+");
                        var match2 = regex2.Match(beforeMatch);
                        if (match2.Success && match2.Value == beforeMatch)
                            beforeMatch = "";
                    }

                    decodedString += beforeMatch;

                    var charset = match.Groups["charset"].Value;
                    var encoding = match.Groups["encoding"].Value.ToUpper();
                    var value = match.Groups["value"].Value;

                    var contentEncoding = Encoding.GetEncoding(charset);

                    if (encoding.Equals("B"))
                    {
                        // Encoded value is Base-64
                        var bytes = Convert.FromBase64String(value);
                        decodedString += contentEncoding.GetString(bytes);
                    }
                    else if (encoding.Equals("Q"))
                    {
                        decodedString +=
                           QuotedPrintableEncoding.Decode(value, contentEncoding).Replace("_", " ");
                    }
                    else
                    {
                        // Encoded value not known, return original string
                        // (Match should not be successful in this case, so this code may never get hit)
                        decodedString += encodedString;
                        break;
                    }

                    // Trim off up to and including the match, then we'll loop and try matching again.
                    encodedString = encodedString.Substring(match.Index + match.Length);

                    encodedWordBefore = true;
                }
                else
                {
                    // No match, not encoded, return original string
                    decodedString += encodedString;
                    break;
                }
            }

            return decodedString;
        }
    }

    public static class QuotedPrintableEncoding
    {
        public static string Decode(string text, Encoding encoding)
        {
            if (text == null)
                throw new ArgumentNullException("text");
            if (encoding == null)
                throw new ArgumentNullException("encoding");

            if (text.Length == 0)
                return text;

            text = text.Replace("=\r\n", "");

            var regex = new Regex(@"(=[0-9A-F][0-9A-F])+", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            return regex.Replace(text, m =>
                {
                    if (m.Value.Length % 3 != 0)
                        throw new InvalidOperationException("Unexpected match length.");

                    var bytes = m.Value.Split(new[] { '=' }, StringSplitOptions.RemoveEmptyEntries)
                        .Select(v => byte.Parse(v, NumberStyles.HexNumber));

                    return encoding.GetString(bytes.ToArray());
                });
        }
    }

Code used from klaas114's post and Dave's post.

UPDATE 31 Jan 2010: implemented my own quoted-printable decoder.

No comments:

Post a Comment