| // Copyright 2016 The Go Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style | 
 | // license that can be found in the LICENSE file. | 
 |  | 
 | // Package bidirule implements the Bidi Rule defined by RFC 5893. | 
 | // | 
 | // This package is under development. The API may change without notice and | 
 | // without preserving backward compatibility. | 
 | package bidirule | 
 |  | 
 | import ( | 
 | 	"errors" | 
 | 	"unicode/utf8" | 
 |  | 
 | 	"golang.org/x/text/transform" | 
 | 	"golang.org/x/text/unicode/bidi" | 
 | ) | 
 |  | 
 | // This file contains an implementation of RFC 5893: Right-to-Left Scripts for | 
 | // Internationalized Domain Names for Applications (IDNA) | 
 | // | 
 | // A label is an individual component of a domain name.  Labels are usually | 
 | // shown separated by dots; for example, the domain name "www.example.com" is | 
 | // composed of three labels: "www", "example", and "com". | 
 | // | 
 | // An RTL label is a label that contains at least one character of class R, AL, | 
 | // or AN. An LTR label is any label that is not an RTL label. | 
 | // | 
 | // A "Bidi domain name" is a domain name that contains at least one RTL label. | 
 | // | 
 | //  The following guarantees can be made based on the above: | 
 | // | 
 | //  o  In a domain name consisting of only labels that satisfy the rule, | 
 | //     the requirements of Section 3 are satisfied.  Note that even LTR | 
 | //     labels and pure ASCII labels have to be tested. | 
 | // | 
 | //  o  In a domain name consisting of only LDH labels (as defined in the | 
 | //     Definitions document [RFC5890]) and labels that satisfy the rule, | 
 | //     the requirements of Section 3 are satisfied as long as a label | 
 | //     that starts with an ASCII digit does not come after a | 
 | //     right-to-left label. | 
 | // | 
 | //  No guarantee is given for other combinations. | 
 |  | 
 | // ErrInvalid indicates a label is invalid according to the Bidi Rule. | 
 | var ErrInvalid = errors.New("bidirule: failed Bidi Rule") | 
 |  | 
 | type ruleState uint8 | 
 |  | 
 | const ( | 
 | 	ruleInitial ruleState = iota | 
 | 	ruleLTR | 
 | 	ruleLTRFinal | 
 | 	ruleRTL | 
 | 	ruleRTLFinal | 
 | 	ruleInvalid | 
 | ) | 
 |  | 
 | type ruleTransition struct { | 
 | 	next ruleState | 
 | 	mask uint16 | 
 | } | 
 |  | 
 | var transitions = [...][2]ruleTransition{ | 
 | 	// [2.1] The first character must be a character with Bidi property L, R, or | 
 | 	// AL. If it has the R or AL property, it is an RTL label; if it has the L | 
 | 	// property, it is an LTR label. | 
 | 	ruleInitial: { | 
 | 		{ruleLTRFinal, 1 << bidi.L}, | 
 | 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL}, | 
 | 	}, | 
 | 	ruleRTL: { | 
 | 		// [2.3] In an RTL label, the end of the label must be a character with | 
 | 		// Bidi property R, AL, EN, or AN, followed by zero or more characters | 
 | 		// with Bidi property NSM. | 
 | 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN}, | 
 |  | 
 | 		// [2.2] In an RTL label, only characters with the Bidi properties R, | 
 | 		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. | 
 | 		// We exclude the entries from [2.3] | 
 | 		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, | 
 | 	}, | 
 | 	ruleRTLFinal: { | 
 | 		// [2.3] In an RTL label, the end of the label must be a character with | 
 | 		// Bidi property R, AL, EN, or AN, followed by zero or more characters | 
 | 		// with Bidi property NSM. | 
 | 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM}, | 
 |  | 
 | 		// [2.2] In an RTL label, only characters with the Bidi properties R, | 
 | 		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed. | 
 | 		// We exclude the entries from [2.3] and NSM. | 
 | 		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, | 
 | 	}, | 
 | 	ruleLTR: { | 
 | 		// [2.6] In an LTR label, the end of the label must be a character with | 
 | 		// Bidi property L or EN, followed by zero or more characters with Bidi | 
 | 		// property NSM. | 
 | 		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN}, | 
 |  | 
 | 		// [2.5] In an LTR label, only characters with the Bidi properties L, | 
 | 		// EN, ES, CS, ET, ON, BN, or NSM are allowed. | 
 | 		// We exclude the entries from [2.6]. | 
 | 		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, | 
 | 	}, | 
 | 	ruleLTRFinal: { | 
 | 		// [2.6] In an LTR label, the end of the label must be a character with | 
 | 		// Bidi property L or EN, followed by zero or more characters with Bidi | 
 | 		// property NSM. | 
 | 		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM}, | 
 |  | 
 | 		// [2.5] In an LTR label, only characters with the Bidi properties L, | 
 | 		// EN, ES, CS, ET, ON, BN, or NSM are allowed. | 
 | 		// We exclude the entries from [2.6]. | 
 | 		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, | 
 | 	}, | 
 | 	ruleInvalid: { | 
 | 		{ruleInvalid, 0}, | 
 | 		{ruleInvalid, 0}, | 
 | 	}, | 
 | } | 
 |  | 
 | // [2.4] In an RTL label, if an EN is present, no AN may be present, and | 
 | // vice versa. | 
 | const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN) | 
 |  | 
 | // From RFC 5893 | 
 | // An RTL label is a label that contains at least one character of type | 
 | // R, AL, or AN. | 
 | // | 
 | // An LTR label is any label that is not an RTL label. | 
 |  | 
 | // Direction reports the direction of the given label as defined by RFC 5893. | 
 | // The Bidi Rule does not have to be applied to labels of the category | 
 | // LeftToRight. | 
 | func Direction(b []byte) bidi.Direction { | 
 | 	for i := 0; i < len(b); { | 
 | 		e, sz := bidi.Lookup(b[i:]) | 
 | 		if sz == 0 { | 
 | 			i++ | 
 | 		} | 
 | 		c := e.Class() | 
 | 		if c == bidi.R || c == bidi.AL || c == bidi.AN { | 
 | 			return bidi.RightToLeft | 
 | 		} | 
 | 		i += sz | 
 | 	} | 
 | 	return bidi.LeftToRight | 
 | } | 
 |  | 
 | // DirectionString reports the direction of the given label as defined by RFC | 
 | // 5893. The Bidi Rule does not have to be applied to labels of the category | 
 | // LeftToRight. | 
 | func DirectionString(s string) bidi.Direction { | 
 | 	for i := 0; i < len(s); { | 
 | 		e, sz := bidi.LookupString(s[i:]) | 
 | 		if sz == 0 { | 
 | 			i++ | 
 | 			continue | 
 | 		} | 
 | 		c := e.Class() | 
 | 		if c == bidi.R || c == bidi.AL || c == bidi.AN { | 
 | 			return bidi.RightToLeft | 
 | 		} | 
 | 		i += sz | 
 | 	} | 
 | 	return bidi.LeftToRight | 
 | } | 
 |  | 
 | // Valid reports whether b conforms to the BiDi rule. | 
 | func Valid(b []byte) bool { | 
 | 	var t Transformer | 
 | 	if n, ok := t.advance(b); !ok || n < len(b) { | 
 | 		return false | 
 | 	} | 
 | 	return t.isFinal() | 
 | } | 
 |  | 
 | // ValidString reports whether s conforms to the BiDi rule. | 
 | func ValidString(s string) bool { | 
 | 	var t Transformer | 
 | 	if n, ok := t.advanceString(s); !ok || n < len(s) { | 
 | 		return false | 
 | 	} | 
 | 	return t.isFinal() | 
 | } | 
 |  | 
 | // New returns a Transformer that verifies that input adheres to the Bidi Rule. | 
 | func New() *Transformer { | 
 | 	return &Transformer{} | 
 | } | 
 |  | 
 | // Transformer implements transform.Transform. | 
 | type Transformer struct { | 
 | 	state  ruleState | 
 | 	hasRTL bool | 
 | 	seen   uint16 | 
 | } | 
 |  | 
 | // A rule can only be violated for "Bidi Domain names", meaning if one of the | 
 | // following categories has been observed. | 
 | func (t *Transformer) isRTL() bool { | 
 | 	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN | 
 | 	return t.seen&isRTL != 0 | 
 | } | 
 |  | 
 | // Reset implements transform.Transformer. | 
 | func (t *Transformer) Reset() { *t = Transformer{} } | 
 |  | 
 | // Transform implements transform.Transformer. This Transformer has state and | 
 | // needs to be reset between uses. | 
 | func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { | 
 | 	if len(dst) < len(src) { | 
 | 		src = src[:len(dst)] | 
 | 		atEOF = false | 
 | 		err = transform.ErrShortDst | 
 | 	} | 
 | 	n, err1 := t.Span(src, atEOF) | 
 | 	copy(dst, src[:n]) | 
 | 	if err == nil || err1 != nil && err1 != transform.ErrShortSrc { | 
 | 		err = err1 | 
 | 	} | 
 | 	return n, n, err | 
 | } | 
 |  | 
 | // Span returns the first n bytes of src that conform to the Bidi rule. | 
 | func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) { | 
 | 	if t.state == ruleInvalid && t.isRTL() { | 
 | 		return 0, ErrInvalid | 
 | 	} | 
 | 	n, ok := t.advance(src) | 
 | 	switch { | 
 | 	case !ok: | 
 | 		err = ErrInvalid | 
 | 	case n < len(src): | 
 | 		if !atEOF { | 
 | 			err = transform.ErrShortSrc | 
 | 			break | 
 | 		} | 
 | 		err = ErrInvalid | 
 | 	case !t.isFinal(): | 
 | 		err = ErrInvalid | 
 | 	} | 
 | 	return n, err | 
 | } | 
 |  | 
 | // Precomputing the ASCII values decreases running time for the ASCII fast path | 
 | // by about 30%. | 
 | var asciiTable [128]bidi.Properties | 
 |  | 
 | func init() { | 
 | 	for i := range asciiTable { | 
 | 		p, _ := bidi.LookupRune(rune(i)) | 
 | 		asciiTable[i] = p | 
 | 	} | 
 | } | 
 |  | 
 | func (t *Transformer) advance(s []byte) (n int, ok bool) { | 
 | 	var e bidi.Properties | 
 | 	var sz int | 
 | 	for n < len(s) { | 
 | 		if s[n] < utf8.RuneSelf { | 
 | 			e, sz = asciiTable[s[n]], 1 | 
 | 		} else { | 
 | 			e, sz = bidi.Lookup(s[n:]) | 
 | 			if sz <= 1 { | 
 | 				if sz == 1 { | 
 | 					// We always consider invalid UTF-8 to be invalid, even if | 
 | 					// the string has not yet been determined to be RTL. | 
 | 					// TODO: is this correct? | 
 | 					return n, false | 
 | 				} | 
 | 				return n, true // incomplete UTF-8 encoding | 
 | 			} | 
 | 		} | 
 | 		// TODO: using CompactClass would result in noticeable speedup. | 
 | 		// See unicode/bidi/prop.go:Properties.CompactClass. | 
 | 		c := uint16(1 << e.Class()) | 
 | 		t.seen |= c | 
 | 		if t.seen&exclusiveRTL == exclusiveRTL { | 
 | 			t.state = ruleInvalid | 
 | 			return n, false | 
 | 		} | 
 | 		switch tr := transitions[t.state]; { | 
 | 		case tr[0].mask&c != 0: | 
 | 			t.state = tr[0].next | 
 | 		case tr[1].mask&c != 0: | 
 | 			t.state = tr[1].next | 
 | 		default: | 
 | 			t.state = ruleInvalid | 
 | 			if t.isRTL() { | 
 | 				return n, false | 
 | 			} | 
 | 		} | 
 | 		n += sz | 
 | 	} | 
 | 	return n, true | 
 | } | 
 |  | 
 | func (t *Transformer) advanceString(s string) (n int, ok bool) { | 
 | 	var e bidi.Properties | 
 | 	var sz int | 
 | 	for n < len(s) { | 
 | 		if s[n] < utf8.RuneSelf { | 
 | 			e, sz = asciiTable[s[n]], 1 | 
 | 		} else { | 
 | 			e, sz = bidi.LookupString(s[n:]) | 
 | 			if sz <= 1 { | 
 | 				if sz == 1 { | 
 | 					return n, false // invalid UTF-8 | 
 | 				} | 
 | 				return n, true // incomplete UTF-8 encoding | 
 | 			} | 
 | 		} | 
 | 		// TODO: using CompactClass results in noticeable speedup. | 
 | 		// See unicode/bidi/prop.go:Properties.CompactClass. | 
 | 		c := uint16(1 << e.Class()) | 
 | 		t.seen |= c | 
 | 		if t.seen&exclusiveRTL == exclusiveRTL { | 
 | 			t.state = ruleInvalid | 
 | 			return n, false | 
 | 		} | 
 | 		switch tr := transitions[t.state]; { | 
 | 		case tr[0].mask&c != 0: | 
 | 			t.state = tr[0].next | 
 | 		case tr[1].mask&c != 0: | 
 | 			t.state = tr[1].next | 
 | 		default: | 
 | 			t.state = ruleInvalid | 
 | 			if t.isRTL() { | 
 | 				return n, false | 
 | 			} | 
 | 		} | 
 | 		n += sz | 
 | 	} | 
 | 	return n, true | 
 | } |