// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
// Package utf8 implements functions and constants to support text encoded in
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 7 | package utf8 |
| 8 | |
// The conditions RuneError==unicode.ReplacementChar and
// MaxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
Rob Pike | 149e3d3 | 2009-08-31 13:01:25 -0700 | [diff] [blame] | 12 | |
// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" rune, also known as the Unicode replacement character
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte
	MaxRune   = '\U0010FFFF' // maximum valid Unicode code point
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character
)
| 20 | |
// Code points in the surrogate range are not valid for UTF-8.
// Both bounds are inclusive.
const (
	surrogateMin = 0xD800 // lowest code point of the surrogate range
	surrogateMax = 0xDFFF // highest code point of the surrogate range
)
| 26 | |
// Bit patterns shared by the encoder and the decoder. A tN/tx value is the
// tag carried in the high bits of a byte, the matching mask selects the
// payload bits left once the tag is removed, and runeNMax is the largest
// rune that fits in an N-byte encoding.
const (
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000: tag of a continuation byte
	t2 = 0xC0 // 1100 0000: tag of the first byte of a 2-byte encoding
	t3 = 0xE0 // 1110 0000: tag of the first byte of a 3-byte encoding
	t4 = 0xF0 // 1111 0000: tag of the first byte of a 4-byte encoding
	t5 = 0xF8 // 1111 1000: first bytes at or above this value are rejected by the decoder

	maskx = 0x3F // 0011 1111: payload bits of a continuation byte
	mask2 = 0x1F // 0001 1111: payload bits of the first byte of a 2-byte encoding
	mask3 = 0x0F // 0000 1111: payload bits of the first byte of a 3-byte encoding
	mask4 = 0x07 // 0000 0111: payload bits of the first byte of a 4-byte encoding

	rune1Max = 1<<7 - 1  // largest rune encoded in 1 byte
	rune2Max = 1<<11 - 1 // largest rune encoded in 2 bytes
	rune3Max = 1<<16 - 1 // largest rune encoded in 3 bytes
)
| 44 | |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 45 | func decodeRuneInternal(p []byte) (r rune, size int, short bool) { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 46 | n := len(p) |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 47 | if n < 1 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 48 | return RuneError, 0, true |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 49 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 50 | c0 := p[0] |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 51 | |
| 52 | // 1-byte, 7-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 53 | if c0 < tx { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 54 | return rune(c0), 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 55 | } |
| 56 | |
| 57 | // unexpected continuation byte? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 58 | if c0 < t2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 59 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 60 | } |
| 61 | |
| 62 | // need first continuation byte |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 63 | if n < 2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 64 | return RuneError, 1, true |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 65 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 66 | c1 := p[1] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 67 | if c1 < tx || t2 <= c1 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 68 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 69 | } |
| 70 | |
| 71 | // 2-byte, 11-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 72 | if c0 < t3 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 73 | r = rune(c0&mask2)<<6 | rune(c1&maskx) |
| 74 | if r <= rune1Max { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 75 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 76 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 77 | return r, 2, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 78 | } |
| 79 | |
| 80 | // need second continuation byte |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 81 | if n < 3 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 82 | return RuneError, 1, true |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 83 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 84 | c2 := p[2] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 85 | if c2 < tx || t2 <= c2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 86 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 87 | } |
| 88 | |
| 89 | // 3-byte, 16-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 90 | if c0 < t4 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 91 | r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) |
| 92 | if r <= rune2Max { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 93 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 94 | } |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 95 | if surrogateMin <= r && r <= surrogateMax { |
| 96 | return RuneError, 1, false |
| 97 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 98 | return r, 3, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 99 | } |
| 100 | |
| 101 | // need third continuation byte |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 102 | if n < 4 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 103 | return RuneError, 1, true |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 104 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 105 | c3 := p[3] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 106 | if c3 < tx || t2 <= c3 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 107 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 108 | } |
| 109 | |
| 110 | // 4-byte, 21-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 111 | if c0 < t5 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 112 | r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 113 | if r <= rune3Max || MaxRune < r { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 114 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 115 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 116 | return r, 4, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 117 | } |
| 118 | |
| 119 | // error |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 120 | return RuneError, 1, false |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 121 | } |
| 122 | |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 123 | func decodeRuneInStringInternal(s string) (r rune, size int, short bool) { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 124 | n := len(s) |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 125 | if n < 1 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 126 | return RuneError, 0, true |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 127 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 128 | c0 := s[0] |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 129 | |
| 130 | // 1-byte, 7-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 131 | if c0 < tx { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 132 | return rune(c0), 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 133 | } |
| 134 | |
| 135 | // unexpected continuation byte? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 136 | if c0 < t2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 137 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 138 | } |
| 139 | |
| 140 | // need first continuation byte |
| 141 | if n < 2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 142 | return RuneError, 1, true |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 143 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 144 | c1 := s[1] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 145 | if c1 < tx || t2 <= c1 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 146 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 147 | } |
| 148 | |
| 149 | // 2-byte, 11-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 150 | if c0 < t3 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 151 | r = rune(c0&mask2)<<6 | rune(c1&maskx) |
| 152 | if r <= rune1Max { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 153 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 154 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 155 | return r, 2, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 156 | } |
| 157 | |
| 158 | // need second continuation byte |
| 159 | if n < 3 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 160 | return RuneError, 1, true |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 161 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 162 | c2 := s[2] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 163 | if c2 < tx || t2 <= c2 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 164 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 165 | } |
| 166 | |
| 167 | // 3-byte, 16-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 168 | if c0 < t4 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 169 | r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx) |
| 170 | if r <= rune2Max { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 171 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 172 | } |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 173 | if surrogateMin <= r && r <= surrogateMax { |
| 174 | return RuneError, 1, false |
| 175 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 176 | return r, 3, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 177 | } |
| 178 | |
| 179 | // need third continuation byte |
| 180 | if n < 4 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 181 | return RuneError, 1, true |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 182 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 183 | c3 := s[3] |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 184 | if c3 < tx || t2 <= c3 { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 185 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 186 | } |
| 187 | |
| 188 | // 4-byte, 21-bit sequence? |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 189 | if c0 < t5 { |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 190 | r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx) |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 191 | if r <= rune3Max || MaxRune < r { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 192 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 193 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 194 | return r, 4, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 195 | } |
| 196 | |
| 197 | // error |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 198 | return RuneError, 1, false |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 199 | } |
| 200 | |
Rob Pike | dfe0853 | 2009-03-05 19:15:13 -0800 | [diff] [blame] | 201 | // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. |
| 202 | // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune. |
Russ Cox | 839a684 | 2009-01-20 14:40:40 -0800 | [diff] [blame] | 203 | func FullRune(p []byte) bool { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 204 | _, _, short := decodeRuneInternal(p) |
| 205 | return !short |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 206 | } |
| 207 | |
Rob Pike | dfe0853 | 2009-03-05 19:15:13 -0800 | [diff] [blame] | 208 | // FullRuneInString is like FullRune but its input is a string. |
Russ Cox | 3619f1e | 2009-05-11 14:10:34 -0700 | [diff] [blame] | 209 | func FullRuneInString(s string) bool { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 210 | _, _, short := decodeRuneInStringInternal(s) |
| 211 | return !short |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 212 | } |
| 213 | |
Nigel Tao | 2dcb613 | 2014-10-16 09:13:50 +1100 | [diff] [blame] | 214 | // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and |
| 215 | // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if |
| 216 | // the encoding is invalid, it returns (RuneError, 1). Both are impossible |
| 217 | // results for correct UTF-8. |
| 218 | // |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 219 | // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is |
| 220 | // out of range, or is not the shortest possible UTF-8 encoding for the |
| 221 | // value. No other validation is performed. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 222 | func DecodeRune(p []byte) (r rune, size int) { |
| 223 | r, size, _ = decodeRuneInternal(p) |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 224 | return |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 225 | } |
| 226 | |
Nigel Tao | 2dcb613 | 2014-10-16 09:13:50 +1100 | [diff] [blame] | 227 | // DecodeRuneInString is like DecodeRune but its input is a string. If s is |
| 228 | // empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it |
| 229 | // returns (RuneError, 1). Both are impossible results for correct UTF-8. |
| 230 | // |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 231 | // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is |
| 232 | // out of range, or is not the shortest possible UTF-8 encoding for the |
| 233 | // value. No other validation is performed. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 234 | func DecodeRuneInString(s string) (r rune, size int) { |
| 235 | r, size, _ = decodeRuneInStringInternal(s) |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 236 | return |
Russ Cox | 387df5e | 2008-11-24 14:51:33 -0800 | [diff] [blame] | 237 | } |
| 238 | |
Nigel Tao | 2dcb613 | 2014-10-16 09:13:50 +1100 | [diff] [blame] | 239 | // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and |
| 240 | // its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if |
| 241 | // the encoding is invalid, it returns (RuneError, 1). Both are impossible |
| 242 | // results for correct UTF-8. |
| 243 | // |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 244 | // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is |
| 245 | // out of range, or is not the shortest possible UTF-8 encoding for the |
| 246 | // value. No other validation is performed. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 247 | func DecodeLastRune(p []byte) (r rune, size int) { |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 248 | end := len(p) |
| 249 | if end == 0 { |
| 250 | return RuneError, 0 |
| 251 | } |
| 252 | start := end - 1 |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 253 | r = rune(p[start]) |
| 254 | if r < RuneSelf { |
| 255 | return r, 1 |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 256 | } |
| 257 | // guard against O(n^2) behavior when traversing |
| 258 | // backwards through strings with long sequences of |
| 259 | // invalid UTF-8. |
| 260 | lim := end - UTFMax |
| 261 | if lim < 0 { |
| 262 | lim = 0 |
| 263 | } |
| 264 | for start--; start >= lim; start-- { |
| 265 | if RuneStart(p[start]) { |
| 266 | break |
| 267 | } |
| 268 | } |
| 269 | if start < 0 { |
| 270 | start = 0 |
| 271 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 272 | r, size = DecodeRune(p[start:end]) |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 273 | if start+size != end { |
| 274 | return RuneError, 1 |
| 275 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 276 | return r, size |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 277 | } |
| 278 | |
Nigel Tao | 2dcb613 | 2014-10-16 09:13:50 +1100 | [diff] [blame] | 279 | // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If |
| 280 | // s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, |
| 281 | // it returns (RuneError, 1). Both are impossible results for correct UTF-8. |
| 282 | // |
Rob Pike | fc360f2 | 2012-07-19 11:58:14 -0700 | [diff] [blame] | 283 | // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is |
| 284 | // out of range, or is not the shortest possible UTF-8 encoding for the |
| 285 | // value. No other validation is performed. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 286 | func DecodeLastRuneInString(s string) (r rune, size int) { |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 287 | end := len(s) |
| 288 | if end == 0 { |
| 289 | return RuneError, 0 |
| 290 | } |
| 291 | start := end - 1 |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 292 | r = rune(s[start]) |
| 293 | if r < RuneSelf { |
| 294 | return r, 1 |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 295 | } |
| 296 | // guard against O(n^2) behavior when traversing |
| 297 | // backwards through strings with long sequences of |
| 298 | // invalid UTF-8. |
| 299 | lim := end - UTFMax |
| 300 | if lim < 0 { |
| 301 | lim = 0 |
| 302 | } |
| 303 | for start--; start >= lim; start-- { |
| 304 | if RuneStart(s[start]) { |
| 305 | break |
| 306 | } |
| 307 | } |
| 308 | if start < 0 { |
| 309 | start = 0 |
| 310 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 311 | r, size = DecodeRuneInString(s[start:end]) |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 312 | if start+size != end { |
| 313 | return RuneError, 1 |
| 314 | } |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 315 | return r, size |
Roger Peppe | f11271b | 2010-09-23 20:33:52 +1000 | [diff] [blame] | 316 | } |
| 317 | |
Rob Pike | dfe0853 | 2009-03-05 19:15:13 -0800 | [diff] [blame] | 318 | // RuneLen returns the number of bytes required to encode the rune. |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 319 | // It returns -1 if the rune is not a valid value to encode in UTF-8. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 320 | func RuneLen(r rune) int { |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 321 | switch { |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 322 | case r < 0: |
| 323 | return -1 |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 324 | case r <= rune1Max: |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 325 | return 1 |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 326 | case r <= rune2Max: |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 327 | return 2 |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 328 | case surrogateMin <= r && r <= surrogateMax: |
| 329 | return -1 |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 330 | case r <= rune3Max: |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 331 | return 3 |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 332 | case r <= MaxRune: |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 333 | return 4 |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 334 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 335 | return -1 |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 336 | } |
| 337 | |
Rob Pike | dfe0853 | 2009-03-05 19:15:13 -0800 | [diff] [blame] | 338 | // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. |
| 339 | // It returns the number of bytes written. |
Russ Cox | 7630a10 | 2011-10-25 22:23:15 -0700 | [diff] [blame] | 340 | func EncodeRune(p []byte, r rune) int { |
Rob Pike | 0d3f5a8 | 2009-12-15 09:31:24 +1100 | [diff] [blame] | 341 | // Negative values are erroneous. Making it unsigned addresses the problem. |
Rui Ueyama | 446d90d | 2014-03-23 15:44:29 -0700 | [diff] [blame] | 342 | switch i := uint32(r); { |
| 343 | case i <= rune1Max: |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 344 | p[0] = byte(r) |
| 345 | return 1 |
Rui Ueyama | 446d90d | 2014-03-23 15:44:29 -0700 | [diff] [blame] | 346 | case i <= rune2Max: |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 347 | p[0] = t2 | byte(r>>6) |
| 348 | p[1] = tx | byte(r)&maskx |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 349 | return 2 |
Rui Ueyama | 446d90d | 2014-03-23 15:44:29 -0700 | [diff] [blame] | 350 | case i > MaxRune, surrogateMin <= i && i <= surrogateMax: |
Rob Pike | 0d3f5a8 | 2009-12-15 09:31:24 +1100 | [diff] [blame] | 351 | r = RuneError |
Rui Ueyama | 446d90d | 2014-03-23 15:44:29 -0700 | [diff] [blame] | 352 | fallthrough |
| 353 | case i <= rune3Max: |
Rob Pike | 3cca9e0 | 2011-08-12 11:50:46 +1000 | [diff] [blame] | 354 | p[0] = t3 | byte(r>>12) |
| 355 | p[1] = tx | byte(r>>6)&maskx |
| 356 | p[2] = tx | byte(r)&maskx |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 357 | return 3 |
Rui Ueyama | 446d90d | 2014-03-23 15:44:29 -0700 | [diff] [blame] | 358 | default: |
| 359 | p[0] = t4 | byte(r>>18) |
| 360 | p[1] = tx | byte(r>>12)&maskx |
| 361 | p[2] = tx | byte(r>>6)&maskx |
| 362 | p[3] = tx | byte(r)&maskx |
| 363 | return 4 |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 364 | } |
Russ Cox | 5169bb4 | 2008-11-21 16:13:31 -0800 | [diff] [blame] | 365 | } |
| 366 | |
Rob Pike | dfe0853 | 2009-03-05 19:15:13 -0800 | [diff] [blame] | 367 | // RuneCount returns the number of runes in p. Erroneous and short |
| 368 | // encodings are treated as single runes of width 1 byte. |
Russ Cox | 839a684 | 2009-01-20 14:40:40 -0800 | [diff] [blame] | 369 | func RuneCount(p []byte) int { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 370 | i := 0 |
| 371 | var n int |
Russ Cox | 0d1cbaf2 | 2008-12-04 21:00:34 -0800 | [diff] [blame] | 372 | for n = 0; i < len(p); n++ { |
| 373 | if p[i] < RuneSelf { |
Robert Griesemer | 40621d5 | 2009-11-09 12:07:39 -0800 | [diff] [blame] | 374 | i++ |
Russ Cox | 0d1cbaf2 | 2008-12-04 21:00:34 -0800 | [diff] [blame] | 375 | } else { |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 376 | _, size := DecodeRune(p[i:]) |
| 377 | i += size |
Russ Cox | 0d1cbaf2 | 2008-12-04 21:00:34 -0800 | [diff] [blame] | 378 | } |
| 379 | } |
Robert Griesemer | 45ca9f7 | 2009-12-15 15:41:46 -0800 | [diff] [blame] | 380 | return n |
Russ Cox | 0d1cbaf2 | 2008-12-04 21:00:34 -0800 | [diff] [blame] | 381 | } |
| 382 | |
// RuneCountInString is like RuneCount but takes its input as a string.
func RuneCountInString(s string) (n int) {
	// Ranging over a string advances one rune per iteration.
	for range s {
		n++
	}
	return n
}
| 390 | |
// RuneStart reports whether the byte could be the first byte of an encoded
// rune. Second and subsequent bytes always have the top two bits set to 10,
// which places them in the range 0x80 through 0xBF.
func RuneStart(b byte) bool {
	return b < 0x80 || 0xBF < b
}
Brad Fitzpatrick | 2b95cfb | 2011-10-06 22:47:24 -0700 | [diff] [blame] | 395 | |
| 396 | // Valid reports whether p consists entirely of valid UTF-8-encoded runes. |
| 397 | func Valid(p []byte) bool { |
| 398 | i := 0 |
| 399 | for i < len(p) { |
| 400 | if p[i] < RuneSelf { |
| 401 | i++ |
| 402 | } else { |
| 403 | _, size := DecodeRune(p[i:]) |
| 404 | if size == 1 { |
Andrey Mirtchovski | be36ab3 | 2013-01-09 11:07:13 -0800 | [diff] [blame] | 405 | // All valid runes of size 1 (those |
Brad Fitzpatrick | 2b95cfb | 2011-10-06 22:47:24 -0700 | [diff] [blame] | 406 | // below RuneSelf) were handled above. |
| 407 | // This must be a RuneError. |
| 408 | return false |
| 409 | } |
| 410 | i += size |
| 411 | } |
| 412 | } |
| 413 | return true |
| 414 | } |
| 415 | |
| 416 | // ValidString reports whether s consists entirely of valid UTF-8-encoded runes. |
| 417 | func ValidString(s string) bool { |
| 418 | for i, r := range s { |
| 419 | if r == RuneError { |
| 420 | // The RuneError value can be an error |
| 421 | // sentinel value (if it's size 1) or the same |
| 422 | // value encoded properly. Decode it to see if |
| 423 | // it's the 1 byte sentinel value. |
| 424 | _, size := DecodeRuneInString(s[i:]) |
| 425 | if size == 1 { |
| 426 | return false |
| 427 | } |
| 428 | } |
| 429 | } |
| 430 | return true |
| 431 | } |
Rob Pike | c48b77b | 2012-08-08 14:01:23 -0700 | [diff] [blame] | 432 | |
| 433 | // ValidRune reports whether r can be legally encoded as UTF-8. |
| 434 | // Code points that are out of range or a surrogate half are illegal. |
| 435 | func ValidRune(r rune) bool { |
| 436 | switch { |
| 437 | case r < 0: |
| 438 | return false |
| 439 | case surrogateMin <= r && r <= surrogateMax: |
| 440 | return false |
| 441 | case r > MaxRune: |
| 442 | return false |
| 443 | } |
| 444 | return true |
| 445 | } |