blob: 9ac37184d69aa4fd308db2280cfc1b343a779c7f [file] [log] [blame]
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
// Package utf8 implements functions and constants to support text encoded in
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
Russ Cox5169bb42008-11-21 16:13:31 -08007package utf8
8
// The conditions RuneError==unicode.ReplacementChar and
// MaxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
Rob Pike149e3d32009-08-31 13:01:25 -070012
// Numbers fundamental to the encoding.
const (
	RuneSelf  = 0x80         // bytes below RuneSelf stand for themselves: one byte, one character.
	UTFMax    = 4            // largest number of bytes in the UTF-8 encoding of one Unicode character.
	MaxRune   = '\U0010FFFF' // highest valid Unicode code point.
	RuneError = '\uFFFD'     // the "error" rune, i.e. the Unicode replacement character.
)
20
// Inclusive bounds of the surrogate range. Code points inside it are not
// valid for UTF-8.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range
)
26
// Bit patterns and limits shared by the encoder and the decoder.
const (
	// Tag bits found in the high end of an encoded byte. tx marks a
	// continuation byte; t2, t3 and t4 mark the lead byte of a 2-, 3- and
	// 4-byte sequence; a first byte of t5 or above is treated as an error.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Masks that keep the payload bits of a continuation byte (maskx) and
	// of the lead byte of a 2-, 3- and 4-byte sequence.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest rune that fits in an encoding of 1, 2 and 3 bytes.
	rune1Max = 0x7F   // 1<<7 - 1
	rune2Max = 0x7FF  // 1<<11 - 1
	rune3Max = 0xFFFF // 1<<16 - 1
)
44
Russ Cox7630a102011-10-25 22:23:15 -070045func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -080046 n := len(p)
Russ Cox387df5e2008-11-24 14:51:33 -080047 if n < 1 {
Robert Griesemer40621d52009-11-09 12:07:39 -080048 return RuneError, 0, true
Russ Cox5169bb42008-11-21 16:13:31 -080049 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -080050 c0 := p[0]
Russ Cox5169bb42008-11-21 16:13:31 -080051
52 // 1-byte, 7-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +100053 if c0 < tx {
Russ Cox7630a102011-10-25 22:23:15 -070054 return rune(c0), 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080055 }
56
57 // unexpected continuation byte?
Rob Pike3cca9e02011-08-12 11:50:46 +100058 if c0 < t2 {
Robert Griesemer40621d52009-11-09 12:07:39 -080059 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080060 }
61
62 // need first continuation byte
Russ Cox387df5e2008-11-24 14:51:33 -080063 if n < 2 {
Robert Griesemer40621d52009-11-09 12:07:39 -080064 return RuneError, 1, true
Russ Cox5169bb42008-11-21 16:13:31 -080065 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -080066 c1 := p[1]
Rob Pike3cca9e02011-08-12 11:50:46 +100067 if c1 < tx || t2 <= c1 {
Robert Griesemer40621d52009-11-09 12:07:39 -080068 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080069 }
70
71 // 2-byte, 11-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +100072 if c0 < t3 {
Russ Cox7630a102011-10-25 22:23:15 -070073 r = rune(c0&mask2)<<6 | rune(c1&maskx)
74 if r <= rune1Max {
Robert Griesemer40621d52009-11-09 12:07:39 -080075 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080076 }
Russ Cox7630a102011-10-25 22:23:15 -070077 return r, 2, false
Russ Cox5169bb42008-11-21 16:13:31 -080078 }
79
80 // need second continuation byte
Russ Cox387df5e2008-11-24 14:51:33 -080081 if n < 3 {
Robert Griesemer40621d52009-11-09 12:07:39 -080082 return RuneError, 1, true
Russ Cox5169bb42008-11-21 16:13:31 -080083 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -080084 c2 := p[2]
Rob Pike3cca9e02011-08-12 11:50:46 +100085 if c2 < tx || t2 <= c2 {
Robert Griesemer40621d52009-11-09 12:07:39 -080086 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080087 }
88
89 // 3-byte, 16-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +100090 if c0 < t4 {
Russ Cox7630a102011-10-25 22:23:15 -070091 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
92 if r <= rune2Max {
Robert Griesemer40621d52009-11-09 12:07:39 -080093 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -080094 }
Rob Pikec48b77b2012-08-08 14:01:23 -070095 if surrogateMin <= r && r <= surrogateMax {
96 return RuneError, 1, false
97 }
Russ Cox7630a102011-10-25 22:23:15 -070098 return r, 3, false
Russ Cox5169bb42008-11-21 16:13:31 -080099 }
100
101 // need third continuation byte
Russ Cox387df5e2008-11-24 14:51:33 -0800102 if n < 4 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800103 return RuneError, 1, true
Russ Cox5169bb42008-11-21 16:13:31 -0800104 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800105 c3 := p[3]
Rob Pike3cca9e02011-08-12 11:50:46 +1000106 if c3 < tx || t2 <= c3 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800107 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -0800108 }
109
110 // 4-byte, 21-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +1000111 if c0 < t5 {
Russ Cox7630a102011-10-25 22:23:15 -0700112 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
Rob Pikefc360f22012-07-19 11:58:14 -0700113 if r <= rune3Max || MaxRune < r {
Robert Griesemer40621d52009-11-09 12:07:39 -0800114 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -0800115 }
Russ Cox7630a102011-10-25 22:23:15 -0700116 return r, 4, false
Russ Cox5169bb42008-11-21 16:13:31 -0800117 }
118
119 // error
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800120 return RuneError, 1, false
Russ Cox5169bb42008-11-21 16:13:31 -0800121}
122
Russ Cox7630a102011-10-25 22:23:15 -0700123func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800124 n := len(s)
Russ Cox387df5e2008-11-24 14:51:33 -0800125 if n < 1 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800126 return RuneError, 0, true
Russ Cox387df5e2008-11-24 14:51:33 -0800127 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800128 c0 := s[0]
Russ Cox387df5e2008-11-24 14:51:33 -0800129
130 // 1-byte, 7-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +1000131 if c0 < tx {
Russ Cox7630a102011-10-25 22:23:15 -0700132 return rune(c0), 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800133 }
134
135 // unexpected continuation byte?
Rob Pike3cca9e02011-08-12 11:50:46 +1000136 if c0 < t2 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800137 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800138 }
139
140 // need first continuation byte
141 if n < 2 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800142 return RuneError, 1, true
Russ Cox387df5e2008-11-24 14:51:33 -0800143 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800144 c1 := s[1]
Rob Pike3cca9e02011-08-12 11:50:46 +1000145 if c1 < tx || t2 <= c1 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800146 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800147 }
148
149 // 2-byte, 11-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +1000150 if c0 < t3 {
Russ Cox7630a102011-10-25 22:23:15 -0700151 r = rune(c0&mask2)<<6 | rune(c1&maskx)
152 if r <= rune1Max {
Robert Griesemer40621d52009-11-09 12:07:39 -0800153 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800154 }
Russ Cox7630a102011-10-25 22:23:15 -0700155 return r, 2, false
Russ Cox387df5e2008-11-24 14:51:33 -0800156 }
157
158 // need second continuation byte
159 if n < 3 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800160 return RuneError, 1, true
Russ Cox387df5e2008-11-24 14:51:33 -0800161 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800162 c2 := s[2]
Rob Pike3cca9e02011-08-12 11:50:46 +1000163 if c2 < tx || t2 <= c2 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800164 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800165 }
166
167 // 3-byte, 16-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +1000168 if c0 < t4 {
Russ Cox7630a102011-10-25 22:23:15 -0700169 r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
170 if r <= rune2Max {
Robert Griesemer40621d52009-11-09 12:07:39 -0800171 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800172 }
Rob Pikec48b77b2012-08-08 14:01:23 -0700173 if surrogateMin <= r && r <= surrogateMax {
174 return RuneError, 1, false
175 }
Russ Cox7630a102011-10-25 22:23:15 -0700176 return r, 3, false
Russ Cox387df5e2008-11-24 14:51:33 -0800177 }
178
179 // need third continuation byte
180 if n < 4 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800181 return RuneError, 1, true
Russ Cox387df5e2008-11-24 14:51:33 -0800182 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800183 c3 := s[3]
Rob Pike3cca9e02011-08-12 11:50:46 +1000184 if c3 < tx || t2 <= c3 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800185 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800186 }
187
188 // 4-byte, 21-bit sequence?
Rob Pike3cca9e02011-08-12 11:50:46 +1000189 if c0 < t5 {
Russ Cox7630a102011-10-25 22:23:15 -0700190 r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
Rob Pikefc360f22012-07-19 11:58:14 -0700191 if r <= rune3Max || MaxRune < r {
Robert Griesemer40621d52009-11-09 12:07:39 -0800192 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800193 }
Russ Cox7630a102011-10-25 22:23:15 -0700194 return r, 4, false
Russ Cox387df5e2008-11-24 14:51:33 -0800195 }
196
197 // error
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800198 return RuneError, 1, false
Russ Cox387df5e2008-11-24 14:51:33 -0800199}
200
Rob Pikedfe08532009-03-05 19:15:13 -0800201// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
202// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
Russ Cox839a6842009-01-20 14:40:40 -0800203func FullRune(p []byte) bool {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800204 _, _, short := decodeRuneInternal(p)
205 return !short
Russ Cox5169bb42008-11-21 16:13:31 -0800206}
207
Rob Pikedfe08532009-03-05 19:15:13 -0800208// FullRuneInString is like FullRune but its input is a string.
Russ Cox3619f1e2009-05-11 14:10:34 -0700209func FullRuneInString(s string) bool {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800210 _, _, short := decodeRuneInStringInternal(s)
211 return !short
Russ Cox387df5e2008-11-24 14:51:33 -0800212}
213
Nigel Tao2dcb6132014-10-16 09:13:50 +1100214// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
215// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
216// the encoding is invalid, it returns (RuneError, 1). Both are impossible
217// results for correct UTF-8.
218//
Rob Pikefc360f22012-07-19 11:58:14 -0700219// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
220// out of range, or is not the shortest possible UTF-8 encoding for the
221// value. No other validation is performed.
Russ Cox7630a102011-10-25 22:23:15 -0700222func DecodeRune(p []byte) (r rune, size int) {
223 r, size, _ = decodeRuneInternal(p)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800224 return
Russ Cox5169bb42008-11-21 16:13:31 -0800225}
226
Nigel Tao2dcb6132014-10-16 09:13:50 +1100227// DecodeRuneInString is like DecodeRune but its input is a string. If s is
228// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
229// returns (RuneError, 1). Both are impossible results for correct UTF-8.
230//
Rob Pikefc360f22012-07-19 11:58:14 -0700231// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
232// out of range, or is not the shortest possible UTF-8 encoding for the
233// value. No other validation is performed.
Russ Cox7630a102011-10-25 22:23:15 -0700234func DecodeRuneInString(s string) (r rune, size int) {
235 r, size, _ = decodeRuneInStringInternal(s)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800236 return
Russ Cox387df5e2008-11-24 14:51:33 -0800237}
238
Nigel Tao2dcb6132014-10-16 09:13:50 +1100239// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
240// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
241// the encoding is invalid, it returns (RuneError, 1). Both are impossible
242// results for correct UTF-8.
243//
Rob Pikefc360f22012-07-19 11:58:14 -0700244// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
245// out of range, or is not the shortest possible UTF-8 encoding for the
246// value. No other validation is performed.
Russ Cox7630a102011-10-25 22:23:15 -0700247func DecodeLastRune(p []byte) (r rune, size int) {
Roger Peppef11271b2010-09-23 20:33:52 +1000248 end := len(p)
249 if end == 0 {
250 return RuneError, 0
251 }
252 start := end - 1
Russ Cox7630a102011-10-25 22:23:15 -0700253 r = rune(p[start])
254 if r < RuneSelf {
255 return r, 1
Roger Peppef11271b2010-09-23 20:33:52 +1000256 }
257 // guard against O(n^2) behavior when traversing
258 // backwards through strings with long sequences of
259 // invalid UTF-8.
260 lim := end - UTFMax
261 if lim < 0 {
262 lim = 0
263 }
264 for start--; start >= lim; start-- {
265 if RuneStart(p[start]) {
266 break
267 }
268 }
269 if start < 0 {
270 start = 0
271 }
Russ Cox7630a102011-10-25 22:23:15 -0700272 r, size = DecodeRune(p[start:end])
Roger Peppef11271b2010-09-23 20:33:52 +1000273 if start+size != end {
274 return RuneError, 1
275 }
Russ Cox7630a102011-10-25 22:23:15 -0700276 return r, size
Roger Peppef11271b2010-09-23 20:33:52 +1000277}
278
Nigel Tao2dcb6132014-10-16 09:13:50 +1100279// DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
280// s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
281// it returns (RuneError, 1). Both are impossible results for correct UTF-8.
282//
Rob Pikefc360f22012-07-19 11:58:14 -0700283// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
284// out of range, or is not the shortest possible UTF-8 encoding for the
285// value. No other validation is performed.
Russ Cox7630a102011-10-25 22:23:15 -0700286func DecodeLastRuneInString(s string) (r rune, size int) {
Roger Peppef11271b2010-09-23 20:33:52 +1000287 end := len(s)
288 if end == 0 {
289 return RuneError, 0
290 }
291 start := end - 1
Russ Cox7630a102011-10-25 22:23:15 -0700292 r = rune(s[start])
293 if r < RuneSelf {
294 return r, 1
Roger Peppef11271b2010-09-23 20:33:52 +1000295 }
296 // guard against O(n^2) behavior when traversing
297 // backwards through strings with long sequences of
298 // invalid UTF-8.
299 lim := end - UTFMax
300 if lim < 0 {
301 lim = 0
302 }
303 for start--; start >= lim; start-- {
304 if RuneStart(s[start]) {
305 break
306 }
307 }
308 if start < 0 {
309 start = 0
310 }
Russ Cox7630a102011-10-25 22:23:15 -0700311 r, size = DecodeRuneInString(s[start:end])
Roger Peppef11271b2010-09-23 20:33:52 +1000312 if start+size != end {
313 return RuneError, 1
314 }
Russ Cox7630a102011-10-25 22:23:15 -0700315 return r, size
Roger Peppef11271b2010-09-23 20:33:52 +1000316}
317
Rob Pikedfe08532009-03-05 19:15:13 -0800318// RuneLen returns the number of bytes required to encode the rune.
Rob Pikec48b77b2012-08-08 14:01:23 -0700319// It returns -1 if the rune is not a valid value to encode in UTF-8.
Russ Cox7630a102011-10-25 22:23:15 -0700320func RuneLen(r rune) int {
Russ Cox5169bb42008-11-21 16:13:31 -0800321 switch {
Rob Pikec48b77b2012-08-08 14:01:23 -0700322 case r < 0:
323 return -1
Russ Cox7630a102011-10-25 22:23:15 -0700324 case r <= rune1Max:
Robert Griesemer40621d52009-11-09 12:07:39 -0800325 return 1
Russ Cox7630a102011-10-25 22:23:15 -0700326 case r <= rune2Max:
Robert Griesemer40621d52009-11-09 12:07:39 -0800327 return 2
Rob Pikec48b77b2012-08-08 14:01:23 -0700328 case surrogateMin <= r && r <= surrogateMax:
329 return -1
Russ Cox7630a102011-10-25 22:23:15 -0700330 case r <= rune3Max:
Robert Griesemer40621d52009-11-09 12:07:39 -0800331 return 3
Rob Pikec48b77b2012-08-08 14:01:23 -0700332 case r <= MaxRune:
Robert Griesemer40621d52009-11-09 12:07:39 -0800333 return 4
Russ Cox5169bb42008-11-21 16:13:31 -0800334 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800335 return -1
Russ Cox5169bb42008-11-21 16:13:31 -0800336}
337
Rob Pikedfe08532009-03-05 19:15:13 -0800338// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
339// It returns the number of bytes written.
Russ Cox7630a102011-10-25 22:23:15 -0700340func EncodeRune(p []byte, r rune) int {
Rob Pike0d3f5a82009-12-15 09:31:24 +1100341 // Negative values are erroneous. Making it unsigned addresses the problem.
Rui Ueyama446d90d2014-03-23 15:44:29 -0700342 switch i := uint32(r); {
343 case i <= rune1Max:
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800344 p[0] = byte(r)
345 return 1
Rui Ueyama446d90d2014-03-23 15:44:29 -0700346 case i <= rune2Max:
Rob Pike3cca9e02011-08-12 11:50:46 +1000347 p[0] = t2 | byte(r>>6)
348 p[1] = tx | byte(r)&maskx
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800349 return 2
Rui Ueyama446d90d2014-03-23 15:44:29 -0700350 case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
Rob Pike0d3f5a82009-12-15 09:31:24 +1100351 r = RuneError
Rui Ueyama446d90d2014-03-23 15:44:29 -0700352 fallthrough
353 case i <= rune3Max:
Rob Pike3cca9e02011-08-12 11:50:46 +1000354 p[0] = t3 | byte(r>>12)
355 p[1] = tx | byte(r>>6)&maskx
356 p[2] = tx | byte(r)&maskx
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800357 return 3
Rui Ueyama446d90d2014-03-23 15:44:29 -0700358 default:
359 p[0] = t4 | byte(r>>18)
360 p[1] = tx | byte(r>>12)&maskx
361 p[2] = tx | byte(r>>6)&maskx
362 p[3] = tx | byte(r)&maskx
363 return 4
Russ Cox5169bb42008-11-21 16:13:31 -0800364 }
Russ Cox5169bb42008-11-21 16:13:31 -0800365}
366
Rob Pikedfe08532009-03-05 19:15:13 -0800367// RuneCount returns the number of runes in p. Erroneous and short
368// encodings are treated as single runes of width 1 byte.
Russ Cox839a6842009-01-20 14:40:40 -0800369func RuneCount(p []byte) int {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800370 i := 0
371 var n int
Russ Cox0d1cbaf22008-12-04 21:00:34 -0800372 for n = 0; i < len(p); n++ {
373 if p[i] < RuneSelf {
Robert Griesemer40621d52009-11-09 12:07:39 -0800374 i++
Russ Cox0d1cbaf22008-12-04 21:00:34 -0800375 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800376 _, size := DecodeRune(p[i:])
377 i += size
Russ Cox0d1cbaf22008-12-04 21:00:34 -0800378 }
379 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800380 return n
Russ Cox0d1cbaf22008-12-04 21:00:34 -0800381}
382
// RuneCountInString is like RuneCount but takes its input as a string. It
// counts one rune per step of the language's own iteration over s.
func RuneCountInString(s string) (n int) {
	for range s {
		n++
	}
	return n
}
390
// RuneStart reports whether the byte could be the first byte of an encoded
// rune. Second and subsequent bytes always have the top two bits set to 10,
// i.e. they lie in 0x80..0xBF; every other byte value is accepted.
func RuneStart(b byte) bool {
	return b < 0x80 || 0xC0 <= b
}
Brad Fitzpatrick2b95cfb2011-10-06 22:47:24 -0700395
396// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
397func Valid(p []byte) bool {
398 i := 0
399 for i < len(p) {
400 if p[i] < RuneSelf {
401 i++
402 } else {
403 _, size := DecodeRune(p[i:])
404 if size == 1 {
Andrey Mirtchovskibe36ab32013-01-09 11:07:13 -0800405 // All valid runes of size 1 (those
Brad Fitzpatrick2b95cfb2011-10-06 22:47:24 -0700406 // below RuneSelf) were handled above.
407 // This must be a RuneError.
408 return false
409 }
410 i += size
411 }
412 }
413 return true
414}
415
416// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
417func ValidString(s string) bool {
418 for i, r := range s {
419 if r == RuneError {
420 // The RuneError value can be an error
421 // sentinel value (if it's size 1) or the same
422 // value encoded properly. Decode it to see if
423 // it's the 1 byte sentinel value.
424 _, size := DecodeRuneInString(s[i:])
425 if size == 1 {
426 return false
427 }
428 }
429 }
430 return true
431}
Rob Pikec48b77b2012-08-08 14:01:23 -0700432
433// ValidRune reports whether r can be legally encoded as UTF-8.
434// Code points that are out of range or a surrogate half are illegal.
435func ValidRune(r rune) bool {
436 switch {
437 case r < 0:
438 return false
439 case surrogateMin <= r && r <= surrogateMax:
440 return false
441 case r > MaxRune:
442 return false
443 }
444 return true
445}