blob: c91aa3af124676acc5b513f9654d2c6e6792be68 [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package mail implements parsing of mail messages.
For the most part, this package follows the syntax as specified by RFC 5322 and
extended by RFC 6532.
Notable divergences:
- Obsolete address formats are not parsed, including addresses with
embedded route information.
- The full range of spacing (the CFWS syntax element) is not supported,
such as breaking addresses across lines.
- No unicode normalization is performed.
- The special characters ()[];@\, are allowed to appear unquoted in names.
*/
package mail
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"mime"
"net/textproto"
"strings"
"sync"
"time"
"unicode/utf8"
)
// debugT is a boolean switch for diagnostic logging. A false value turns
// every Printf call into a no-op.
type debugT bool

// debug gates the parser's trace output. It is disabled by default.
var debug = debugT(false)

// Printf forwards format and args to log.Printf when d is true and does
// nothing otherwise.
func (d debugT) Printf(format string, args ...any) {
	if !d {
		return
	}
	log.Printf(format, args...)
}
// A Message represents a parsed mail message.
type Message struct {
// Header holds the parsed header fields of the message.
Header Header
// Body yields whatever follows the header. ReadMessage sets it to the
// same buffered reader that the header was read from.
Body io.Reader
}
// ReadMessage reads a message from r.
// The headers are parsed, and the body of the message will be available
// for reading from msg.Body.
//
// A message that consists only of header fields, with the input ending
// before the blank line that normally separates header and body, is
// accepted: RFC 5322 makes the body optional. Reading msg.Body then yields
// no data. Input from which no header field could be read still returns the
// underlying error (io.EOF for empty input).
func ReadMessage(r io.Reader) (msg *Message, err error) {
	tp := textproto.NewReader(bufio.NewReader(r))

	hdr, err := tp.ReadMIMEHeader()
	// ReadMIMEHeader reports io.EOF when the input ends before the
	// terminating blank line, but it still returns the fields parsed so
	// far. Treat that as a header-only message rather than a failure.
	if err != nil && (err != io.EOF || len(hdr) == 0) {
		return nil, err
	}

	return &Message{
		Header: Header(hdr),
		Body:   tp.R,
	}, nil
}
// Layouts suitable for passing to time.Parse.
// These are tried in order.
//
// dateLayouts starts out empty; ParseDate fills it on first use by running
// buildDateLayouts through dateLayoutsBuildOnce.
var (
dateLayoutsBuildOnce sync.Once
dateLayouts []string
)
// buildDateLayouts appends to dateLayouts every time.Parse layout obtained
// by combining the optional pieces of the RFC 5322 (section 3.3) date-time
// syntax. The zone varies fastest and the day-of-week slowest, which fixes
// the order in which ParseDate tries the layouts.
func buildDateLayouts() {
	var (
		weekdayForms = [...]string{"", "Mon, "}           // optional day-of-week
		dayForms     = [...]string{"2", "02"}             // day = 1*2DIGIT
		yearForms    = [...]string{"2006", "06"}          // year = 4*DIGIT / 2*DIGIT
		secondForms  = [...]string{":05", ""}             // optional seconds
		zoneForms    = [...]string{"-0700", "MST", "UT"}  // numeric offset / named zone / "UT"
	)

	for _, weekday := range weekdayForms {
		for _, day := range dayForms {
			datePrefix := weekday + day + " Jan "
			for _, year := range yearForms {
				for _, second := range secondForms {
					upToZone := datePrefix + year + " 15:04" + second + " "
					for _, zone := range zoneForms {
						dateLayouts = append(dateLayouts, upToZone+zone)
					}
				}
			}
		}
	}
}
// ParseDate parses an RFC 5322 date string.
//
// CRLF pairs are removed from date before parsing; a CR that is still
// present afterwards is reported as an error. The text up to and including
// the time zone is then matched against each entry of dateLayouts in turn,
// and the first successful time.Parse result is returned. Parenthetical
// comments that directly follow the zone are skipped; an unterminated
// comment is reported as an error.
func ParseDate(date string) (time.Time, error) {
// Build the layout table on first use only.
dateLayoutsBuildOnce.Do(buildDateLayouts)
// CR and LF must match and are tolerated anywhere in the date field.
date = strings.ReplaceAll(date, "\r\n", "")
if strings.Contains(date, "\r") {
return time.Time{}, errors.New("mail: header has a CR without LF")
}
// Re-using some addrParser methods which support obsolete text, i.e. non-printable ASCII
p := addrParser{date, nil}
p.skipSpace()
// Split the input into the part handed to time.Parse (date) and the
// remainder that may hold trailing comments (p.s).
// RFC 5322: zone = (FWS ( "+" / "-" ) 4DIGIT) / obs-zone
// zone length is always 5 chars unless obsolete (obs-zone)
if ind := strings.IndexAny(p.s, "+-"); ind != -1 && len(p.s) >= ind+5 {
// Numeric zone: keep everything through the sign and the four
// bytes after it.
date = p.s[:ind+5]
p.s = p.s[ind+5:]
} else {
// No usable numeric zone: look for the letter T instead.
ind := strings.Index(p.s, "T")
if ind == 0 {
// In this case we have the following date formats:
// * Thu, 20 Nov 1997 09:55:06 MDT
// * Thu, 20 Nov 1997 09:55:06 MDT (MDT)
// * Thu, 20 Nov 1997 09:55:06 MDT (This comment)
// The leading T belongs to the day name, so search again after it.
ind = strings.Index(p.s[1:], "T")
if ind != -1 {
// Convert the offset within p.s[1:] back to an offset within p.s.
ind++
}
}
if ind != -1 && len(p.s) >= ind+5 {
// The last letter T of the obsolete time zone is checked when no standard time zone is found.
// If T is misplaced, the date to parse is garbage.
date = p.s[:ind+1]
p.s = p.s[ind+1:]
}
// When neither split applies, date keeps its CRLF-stripped value
// and p.s still holds the whole (space-trimmed) input.
}
// Skip comments at the start of the remainder. What is left in p.s
// afterwards is not examined.
if !p.skipCFWS() {
return time.Time{}, errors.New("mail: misformatted parenthetical comment")
}
// The first layout that parses wins.
for _, layout := range dateLayouts {
t, err := time.Parse(layout, date)
if err == nil {
return t, nil
}
}
return time.Time{}, errors.New("mail: header could not be parsed")
}
// A Header represents the key-value pairs in a mail message header.
// Each key maps to the list of values recorded for that field. The type is
// convertible to and from textproto.MIMEHeader, which is how ReadMessage
// produces it.
type Header map[string][]string
// Get returns the first value stored under key, or "" when the key has no
// values. The lookup is case insensitive because the key is canonicalized
// with CanonicalMIMEHeaderKey first. Index the map directly to read every
// value of a key or to use a non-canonical key.
func (h Header) Get(key string) string {
	fields := textproto.MIMEHeader(h)
	return fields.Get(key)
}
// ErrHeaderNotPresent is returned by Header.Date and Header.AddressList
// when Get yields an empty string for the requested field.
var ErrHeaderNotPresent = errors.New("mail: header not in message")
// Date parses the Date header field with ParseDate.
// It returns ErrHeaderNotPresent when the field is absent or empty.
func (h Header) Date() (time.Time, error) {
	if value := h.Get("Date"); value != "" {
		return ParseDate(value)
	}
	return time.Time{}, ErrHeaderNotPresent
}
// AddressList parses the header field named key as a list of addresses
// using ParseAddressList.
// It returns ErrHeaderNotPresent when the field is absent or empty.
func (h Header) AddressList(key string) ([]*Address, error) {
	if value := h.Get(key); value != "" {
		return ParseAddressList(value)
	}
	return nil, ErrHeaderNotPresent
}
// Address represents a single mail address.
// An address such as "Barry Gibbs <bg@example.com>" is represented
// as Address{Name: "Barry Gibbs", Address: "bg@example.com"}.
// The parser stores the display name without quoting or RFC 2047 encoding;
// String re-applies either one when rendering.
type Address struct {
Name string // Proper name; may be empty.
Address string // user@domain
}
// ParseAddress parses a single RFC 5322 address, e.g. "Barry Gibbs <bg@example.com>".
// No custom RFC 2047 word decoder is installed on the parser it creates.
func ParseAddress(address string) (*Address, error) {
	parser := addrParser{s: address}
	return parser.parseSingleAddress()
}
// ParseAddressList parses list as a list of addresses.
// No custom RFC 2047 word decoder is installed on the parser it creates.
func ParseAddressList(list string) ([]*Address, error) {
	parser := addrParser{s: list}
	return parser.parseAddressList()
}
// An AddressParser is an RFC 5322 address parser.
// Its Parse and ParseList methods behave like ParseAddress and
// ParseAddressList, except that they hand WordDecoder to the underlying
// parser.
type AddressParser struct {
// WordDecoder optionally specifies a decoder for RFC 2047 encoded-words.
// When it is nil, the package's built-in decoder is used instead.
WordDecoder *mime.WordDecoder
}
// Parse parses a single RFC 5322 address of the
// form "Gogh Fir <gf@example.com>" or "foo@example.com".
// Encoded words are decoded with p.WordDecoder.
func (p *AddressParser) Parse(address string) (*Address, error) {
	inner := addrParser{s: address, dec: p.WordDecoder}
	return inner.parseSingleAddress()
}
// ParseList parses list as comma-separated addresses
// of the form "Gogh Fir <gf@example.com>" or "foo@example.com".
// Encoded words are decoded with p.WordDecoder.
func (p *AddressParser) ParseList(list string) ([]*Address, error) {
	inner := addrParser{s: list, dec: p.WordDecoder}
	return inner.parseAddressList()
}
// String formats the address as a valid RFC 5322 address.
// If the address's name contains non-ASCII characters
// the name will be rendered according to RFC 2047.
//
// The result always contains the angle-addr "<local@domain>". The
// local-part is rendered as a quoted-string when it is not a plain
// dot-atom. A non-empty Name is placed in front of it, either as a
// quoted-string or as an RFC 2047 encoded-word.
func (a *Address) String() string {
	// Split at the last '@'. A value without '@' is malformed ("@" is
	// required in addr-spec) and is treated entirely as the local-part.
	var local, domain string
	if at := strings.LastIndex(a.Address, "@"); at >= 0 {
		local, domain = a.Address[:at], a.Address[at+1:]
	} else {
		local = a.Address
	}

	// The local-part needs quoting unless it consists of atext with single
	// interior dots. For a dot it is enough to check that it is neither the
	// first nor the last byte and that the previous byte is not a dot too.
	needsQuoting := false
	lastIndex := len(local) - 1
	for i, r := range local {
		if isAtext(r, false, false) {
			continue
		}
		if r == '.' && i > 0 && local[i-1] != '.' && i < lastIndex {
			continue
		}
		needsQuoting = true
		break
	}
	if needsQuoting {
		local = quoteString(local)
	}

	angleAddr := "<" + local + "@" + domain + ">"
	if a.Name == "" {
		return angleAddr
	}

	// A name made only of printable ASCII and white space can simply be
	// quoted. (isWSP should really be isFWS, but folding is not supported.)
	plainASCII := true
	for _, r := range a.Name {
		if isMultibyte(r) || (!isVchar(r) && !isWSP(r)) {
			plainASCII = false
			break
		}
	}

	switch {
	case plainASCII:
		return quoteString(a.Name) + " " + angleAddr
	case strings.ContainsAny(a.Name, "\"#$%&'(),.:;<>@[]^`{|}~"):
		// Text in an encoded-word in a display-name must not contain
		// certain characters like quotes or parentheses (see RFC 2047
		// section 5.3), so such names are base64 encoded.
		return mime.BEncoding.Encode("utf-8", a.Name) + " " + angleAddr
	default:
		return mime.QEncoding.Encode("utf-8", a.Name) + " " + angleAddr
	}
}
// addrParser carries the state of one parse: the input that has not been
// consumed yet and the decoder to use for RFC 2047 encoded-words.
type addrParser struct {
// s is the unconsumed remainder of the input; the consume and skip
// methods advance the parse by reslicing it.
s string
// dec decodes encoded-words; decodeRFC2047Word substitutes
// rfc2047Decoder when it is nil.
dec *mime.WordDecoder // may be nil
}
// parseAddressList parses the whole of p as a comma-separated list of
// addresses and groups. Empty entries (extra commas, RFC 5322
// obs-addr-list) before, between and after the addresses are skipped.
func (p *addrParser) parseAddressList() ([]*Address, error) {
	var parsed []*Address
	for {
		p.skipSpace()

		// A comma here is an empty entry: drop it and look again.
		if p.consume(',') {
			continue
		}

		entry, err := p.parseAddress(true)
		if err != nil {
			return nil, err
		}
		parsed = append(parsed, entry...)

		switch {
		case !p.skipCFWS():
			return nil, errors.New("mail: misformatted parenthetical comment")
		case p.empty():
			return parsed, nil
		case p.peek() != ',':
			return nil, errors.New("mail: expected comma")
		}

		// Swallow the separator together with any empty entries after it.
		for p.consume(',') {
			p.skipSpace()
		}
		if p.empty() {
			return parsed, nil
		}
	}
}
// parseSingleAddress parses the whole of p as exactly one mailbox.
// A group is accepted only when it contains exactly one address. Anything
// other than comments after the address is an error.
func (p *addrParser) parseSingleAddress() (*Address, error) {
	found, err := p.parseAddress(true)
	switch {
	case err != nil:
		return nil, err
	case !p.skipCFWS():
		return nil, errors.New("mail: misformatted parenthetical comment")
	case !p.empty():
		return nil, fmt.Errorf("mail: expected single address, got %q", p.s)
	}

	switch len(found) {
	case 0:
		return nil, errors.New("mail: empty group")
	case 1:
		return found[0], nil
	default:
		return nil, errors.New("mail: group with multiple addresses")
	}
}
// parseAddress parses a single RFC 5322 address at the start of p.
// The result holds one element for a mailbox and zero or more for a group.
// Groups are recognized only when handleGroup is true.
func (p *addrParser) parseAddress(handleGroup bool) ([]*Address, error) {
	debug.Printf("parseAddress: %q", p.s)
	p.skipSpace()
	if p.empty() {
		return nil, errors.New("mail: no address")
	}

	// address = mailbox / group
	// mailbox = name-addr / addr-spec
	// group = display-name ":" [group-list] ";" [CFWS]

	// addr-spec has a more restricted grammar than name-addr,
	// so try parsing it first, and fallback to name-addr.
	// TODO(dsymonds): Is this really correct?
	addrSpec, err := p.consumeAddrSpec()
	if err == nil {
		// Bare addr-spec. A parenthetical comment right after it is taken
		// as the display name.
		name := ""
		p.skipSpace()
		if !p.empty() && p.peek() == '(' {
			if name, err = p.consumeDisplayNameComment(); err != nil {
				return nil, err
			}
		}
		return []*Address{{Name: name, Address: addrSpec}}, nil
	}
	debug.Printf("parseAddress: not an addr-spec: %v", err)
	debug.Printf("parseAddress: state is now %q", p.s)

	// display-name (absent when the input starts with the angle-addr)
	name := ""
	if p.peek() != '<' {
		if name, err = p.consumePhrase(); err != nil {
			return nil, err
		}
	}
	debug.Printf("parseAddress: displayName=%q", name)

	p.skipSpace()
	if handleGroup && p.consume(':') {
		return p.consumeGroupList()
	}

	// angle-addr = "<" addr-spec ">"
	if !p.consume('<') {
		onlyAtext := true
		for _, r := range name {
			if !isAtext(r, true, false) {
				onlyAtext = false
				break
			}
		}
		if onlyAtext {
			// The input is like "foo.bar"; it's possible the input
			// meant to be "foo.bar@domain", or "foo.bar <...>".
			return nil, errors.New("mail: missing '@' or angle-addr")
		}
		// The input is like "Full Name", which couldn't possibly be a
		// valid email address if followed by "@domain"; the input
		// likely meant to be "Full Name <...>".
		return nil, errors.New("mail: no angle-addr")
	}
	if addrSpec, err = p.consumeAddrSpec(); err != nil {
		return nil, err
	}
	if !p.consume('>') {
		return nil, errors.New("mail: unclosed angle-addr")
	}
	debug.Printf("parseAddress: spec=%q", addrSpec)

	return []*Address{{Name: name, Address: addrSpec}}, nil
}
// consumeGroupList parses the remainder of an RFC 5322 group, i.e. what
// follows the ':' after the group's display name: an optional
// comma-separated list of mailboxes, the terminating ';', and any comments
// after it. Nested groups are not allowed.
//
// It returns the mailboxes of the group (none for an empty group). A
// malformed parenthetical comment anywhere, including after the closing
// ';', is reported as an error.
func (p *addrParser) consumeGroupList() ([]*Address, error) {
	var group []*Address
	// handle empty group.
	p.skipSpace()
	if p.consume(';') {
		// Check the result like every other skipCFWS call site does, so
		// that an unterminated comment is not silently accepted.
		if !p.skipCFWS() {
			return nil, errors.New("mail: misformatted parenthetical comment")
		}
		return group, nil
	}

	for {
		p.skipSpace()
		// embedded groups not allowed.
		addrs, err := p.parseAddress(false)
		if err != nil {
			return nil, err
		}
		group = append(group, addrs...)

		if !p.skipCFWS() {
			return nil, errors.New("mail: misformatted parenthetical comment")
		}
		if p.consume(';') {
			if !p.skipCFWS() {
				return nil, errors.New("mail: misformatted parenthetical comment")
			}
			break
		}
		if !p.consume(',') {
			return nil, errors.New("mail: expected comma")
		}
	}
	return group, nil
}
// consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
//
// On success it returns "local-part@domain" and leaves p positioned after
// the domain. On failure it returns an error and restores p to the state it
// had on entry, so the caller can try a different production. A
// quoted-string local-part that fails to parse is reported with the
// quoted-string's own error; the "empty quoted string" error is reserved
// for a well-formed but empty one.
func (p *addrParser) consumeAddrSpec() (spec string, err error) {
	debug.Printf("consumeAddrSpec: %q", p.s)

	// Roll the parser back if anything below fails.
	orig := *p
	defer func() {
		if err != nil {
			*p = orig
		}
	}()

	// local-part = dot-atom / quoted-string
	var localPart string
	p.skipSpace()
	if p.empty() {
		return "", errors.New("mail: no addr-spec")
	}
	if p.peek() == '"' {
		// quoted-string
		debug.Printf("consumeAddrSpec: parsing quoted-string")
		localPart, err = p.consumeQuotedString()
		// Do not let the emptiness check overwrite a more specific
		// parse error such as an unclosed quoted-string.
		if err == nil && localPart == "" {
			err = errors.New("mail: empty quoted string in addr-spec")
		}
	} else {
		// dot-atom
		debug.Printf("consumeAddrSpec: parsing dot-atom")
		localPart, err = p.consumeAtom(true, false)
	}
	if err != nil {
		debug.Printf("consumeAddrSpec: failed: %v", err)
		return "", err
	}

	if !p.consume('@') {
		return "", errors.New("mail: missing @ in addr-spec")
	}

	// domain = dot-atom / domain-literal
	var domain string
	p.skipSpace()
	if p.empty() {
		return "", errors.New("mail: no domain in addr-spec")
	}
	// TODO(dsymonds): Handle domain-literal
	domain, err = p.consumeAtom(true, false)
	if err != nil {
		return "", err
	}

	return localPart + "@" + domain, nil
}
// consumePhrase parses the RFC 5322 phrase at the start of p.
//
// Words are consumed until the input ends or a word fails to parse, and are
// joined with single spaces. Two adjacent RFC 2047 encoded-words are
// concatenated without a space. An error is returned only when a word fails
// to parse before any word has been collected.
func (p *addrParser) consumePhrase() (phrase string, err error) {
	debug.Printf("consumePhrase: [%s]", p.s)
	// phrase = 1*word
	var (
		collected      []string
		lastWasEncoded bool
	)
	for {
		p.skipSpace()
		if p.empty() {
			break
		}

		// word = atom / quoted-string
		var (
			token   string
			encoded bool
		)
		if p.peek() == '"' {
			// quoted-string
			token, err = p.consumeQuotedString()
		} else if token, err = p.consumeAtom(true, true); err == nil {
			// We actually parse dot-atom here to be more permissive
			// than what RFC 5322 specifies.
			token, encoded, err = p.decodeRFC2047Word(token)
		}
		if err != nil {
			break
		}
		debug.Printf("consumePhrase: consumed %q", token)

		if lastWasEncoded && encoded {
			collected[len(collected)-1] += token
		} else {
			collected = append(collected, token)
		}
		lastWasEncoded = encoded
	}

	// Ignore any error if we got at least one word.
	if len(collected) == 0 && err != nil {
		debug.Printf("consumePhrase: hit err: %v", err)
		return "", fmt.Errorf("mail: missing word in phrase: %v", err)
	}
	return strings.Join(collected, " "), nil
}
// consumeQuotedString parses the quoted string at the start of p.
// The first byte of p is assumed to be the opening '"' and is not checked.
// On success the unquoted, unescaped content is returned and p is advanced
// past the closing quote; on error p is left unchanged.
func (p *addrParser) consumeQuotedString() (qs string, err error) {
	var (
		content        = make([]rune, 0, 10)
		pos            = 1 // skip the opening quote
		afterBackslash = false
	)
	for {
		r, width := utf8.DecodeRuneInString(p.s[pos:])
		if width == 0 {
			return "", errors.New("mail: unclosed quoted-string")
		}
		if width == 1 && r == utf8.RuneError {
			return "", fmt.Errorf("mail: invalid utf-8 in quoted-string: %q", p.s)
		}

		switch {
		case afterBackslash:
			// quoted-pair = ("\" (VCHAR / WSP))
			if !(isVchar(r) || isWSP(r)) {
				return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
			}
			content = append(content, r)
			afterBackslash = false
		case isQtext(r) || isWSP(r):
			// qtext (printable US-ASCII excluding " and \), or
			// FWS (almost; we're ignoring CRLF)
			content = append(content, r)
		case r == '"':
			// Closing quote: consume it and stop.
			p.s = p.s[pos+1:]
			return string(content), nil
		case r == '\\':
			afterBackslash = true
		default:
			return "", fmt.Errorf("mail: bad character in quoted-string: %q", r)
		}
		pos += width
	}
}
// consumeAtom parses an RFC 5322 atom at the start of p.
// If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
// If permissive is true, consumeAtom will not fail on:
//   - leading/trailing/double dots in the atom (see golang.org/issue/4938)
//   - special characters (RFC 5322 3.2.3) except '<', '>', ':' and '"' (see golang.org/issue/21018)
//
// The atom is removed from p before the dot placement is validated, so p
// stays advanced when one of the dot errors is returned.
func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) {
	// Find the end of the longest prefix made of acceptable characters.
	end := 0
	for end < len(p.s) {
		r, width := utf8.DecodeRuneInString(p.s[end:])
		if width == 1 && r == utf8.RuneError {
			return "", fmt.Errorf("mail: invalid utf-8 in address: %q", p.s)
		}
		if !isAtext(r, dot, permissive) {
			break
		}
		end += width
	}
	if end == 0 {
		return "", errors.New("mail: invalid string")
	}

	atom, p.s = p.s[:end], p.s[end:]
	if permissive {
		return atom, nil
	}
	switch {
	case strings.HasPrefix(atom, "."):
		return "", errors.New("mail: leading dot in atom")
	case strings.Contains(atom, ".."):
		return "", errors.New("mail: double dot in atom")
	case strings.HasSuffix(atom, "."):
		return "", errors.New("mail: trailing dot in atom")
	}
	return atom, nil
}
// consumeDisplayNameComment parses a parenthetical comment at the start of
// p and returns its text for use as a display name. The text is split on
// spaces and tabs, every RFC 2047 encoded-word among the pieces is decoded,
// and the pieces are joined again with single spaces.
func (p *addrParser) consumeDisplayNameComment() (string, error) {
	if !p.consume('(') {
		return "", errors.New("mail: comment does not start with (")
	}
	text, closed := p.consumeComment()
	if !closed {
		return "", errors.New("mail: misformatted parenthetical comment")
	}

	// TODO(stapelberg): parse quoted-string within comment
	fields := strings.FieldsFunc(text, func(r rune) bool { return r == ' ' || r == '\t' })
	for i := range fields {
		decoded, wasEncoded, err := p.decodeRFC2047Word(fields[i])
		if err != nil {
			return "", err
		}
		if wasEncoded {
			fields[i] = decoded
		}
	}
	return strings.Join(fields, " "), nil
}
// consume removes the first byte of the input when it equals c and reports
// whether it did so. Empty input or a different byte leaves p untouched.
func (p *addrParser) consume(c byte) bool {
	if !p.empty() && p.peek() == c {
		p.s = p.s[1:]
		return true
	}
	return false
}
// skipSpace drops every space and horizontal tab at the front of the
// input. No other white space is removed.
func (p *addrParser) skipSpace() {
	p.s = strings.TrimLeftFunc(p.s, func(r rune) bool {
		return r == ' ' || r == '\t'
	})
}
// peek returns the first byte of the unconsumed input without consuming it.
// It indexes p.s directly, so it panics on empty input; callers are
// expected to check empty first.
func (p *addrParser) peek() byte {
return p.s[0]
}
// empty reports whether all of the input has been consumed.
func (p *addrParser) empty() bool {
	return len(p.s) == 0
}
// len returns the number of bytes of input that have not been consumed yet.
func (p *addrParser) len() int {
return len(p.s)
}
// skipCFWS skips CFWS as defined in RFC5322: any run of spaces, tabs and
// parenthetical comments at the front of the input. It reports false when
// a comment is not properly terminated.
func (p *addrParser) skipCFWS() bool {
	p.skipSpace()
	for p.consume('(') {
		if _, closed := p.consumeComment(); !closed {
			return false
		}
		p.skipSpace()
	}
	return true
}
// consumeComment consumes the body of a parenthetical comment whose
// opening '(' has already been consumed, up to and including the matching
// ')'. Nested parentheses are tracked, and a backslash makes the byte after
// it part of the comment without letting it affect nesting.
//
// It returns the comment text (without the outer parentheses and without
// the escaping backslashes) and whether the comment was properly closed.
// When the input ends first, everything was consumed and the second result
// is false.
func (p *addrParser) consumeComment() (string, bool) {
	// '(' already consumed.
	depth := 1

	// Accumulate in a builder: appending to a string one byte at a time
	// would copy the whole text on every step.
	var comment strings.Builder
	for !p.empty() && depth > 0 {
		switch c := p.peek(); {
		case c == '\\' && p.len() > 1:
			// Drop the backslash; the escaped byte is copied below.
			p.s = p.s[1:]
		case c == '(':
			depth++
		case c == ')':
			depth--
		}
		// The ')' that closes the comment is not part of its text.
		if depth > 0 {
			comment.WriteByte(p.s[0])
		}
		p.s = p.s[1:]
	}

	return comment.String(), depth == 0
}
// decodeRFC2047Word tries to decode s as an RFC 2047 encoded-word, using
// p.dec or, when that is nil, rfc2047Decoder.
//
// It returns:
//   - the decoded text, true, nil when s decodes cleanly;
//   - s, true, and the error when decoding failed because the charset
//     reader rejected the charset;
//   - s, false, nil for any other decoding failure, i.e. s is treated as
//     plain text.
func (p *addrParser) decodeRFC2047Word(s string) (word string, isEncoded bool, err error) {
	base := p.dec
	if base == nil {
		base = &rfc2047Decoder
	}

	// Decode with a copy whose CharsetReader records whether it was the
	// one that failed. Looking for the charsetError type in the result
	// instead behaves badly with CharsetReaders other than the one in
	// rfc2047Decoder.
	charsetFailed := false
	tracking := *base
	tracking.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		var (
			converted io.Reader
			convErr   error
		)
		if base.CharsetReader != nil {
			converted, convErr = base.CharsetReader(charset, input)
		} else {
			convErr = charsetError(charset)
		}
		if convErr != nil {
			charsetFailed = true
		}
		return converted, convErr
	}

	decoded, decodeErr := tracking.Decode(s)
	switch {
	case decodeErr == nil:
		return decoded, true, nil
	case charsetFailed:
		// The word itself was well formed but its character set is not
		// usable: hand back the original text together with the error.
		return s, true, decodeErr
	default:
		// Ignore invalid RFC 2047 encoded-word errors.
		return s, false, nil
	}
}
// rfc2047Decoder is the decoder decodeRFC2047Word falls back to when an
// addrParser has no decoder of its own. Its CharsetReader rejects every
// charset it is handed with a charsetError.
var rfc2047Decoder = mime.WordDecoder{
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
return nil, charsetError(charset)
},
}
// charsetError is the error reported for a character set that cannot be
// decoded. Its value is the name of the offending charset.
type charsetError string

// Error implements the error interface; the charset name is quoted.
func (e charsetError) Error() string {
	return "charset not supported: " + fmt.Sprintf("%q", string(e))
}
// isAtext reports whether r is an RFC 5322 atext character.
// A period counts only if dot is true. The RFC 5322 3.2.3 specials
// ( ) [ ] ; @ \ and , count only if permissive is true, while '<', '>',
// '"' and ':' never count. Every other character is atext exactly when it
// is a VCHAR.
func isAtext(r rune, dot, permissive bool) bool {
	if r == '.' {
		return dot
	}
	switch r {
	case '<', '>', '"', ':':
		return false
	case '(', ')', '[', ']', ';', '@', '\\', ',':
		// RFC 5322 3.2.3. specials
		return permissive
	default:
		return isVchar(r)
	}
}
// isQtext reports whether r is an RFC 5322 qtext character: any VCHAR
// other than the backslash and the double quote.
func isQtext(r rune) bool {
	return r != '\\' && r != '"' && isVchar(r)
}
// quoteString renders a string as an RFC 5322 quoted-string.
// qtext and white space are copied as they are, any other VCHAR (the
// backslash and the double quote) is escaped with a backslash, and
// characters that are neither are left out of the result.
func quoteString(s string) string {
	var out strings.Builder
	out.WriteByte('"')
	for _, r := range s {
		switch {
		case isQtext(r), isWSP(r):
			out.WriteRune(r)
		case isVchar(r):
			out.WriteByte('\\')
			out.WriteRune(r)
		}
	}
	out.WriteByte('"')
	return out.String()
}
// isVchar reports whether r is an RFC 5322 VCHAR character: a visible
// (printing) US-ASCII character from '!' to '~', or any multi-byte
// character.
func isVchar(r rune) bool {
	if isMultibyte(r) {
		return true
	}
	return r >= '!' && r <= '~'
}
// isMultibyte reports whether r is a multi-byte UTF-8 character
// as supported by RFC 6532, i.e. whether it lies at or above
// utf8.RuneSelf.
func isMultibyte(r rune) bool {
	if r < utf8.RuneSelf {
		return false
	}
	return true
}
// isWSP reports whether r is a WSP (white space).
// WSP is a space or horizontal tab (RFC 5234 Appendix B).
func isWSP(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}