Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | package main |
| 8 | |
| 9 | // This program generates table.go and table_test.go. |
| 10 | // Invoke as: |
| 11 | // |
| 12 | // go run gen.go -version "xxx" >table.go |
| 13 | // go run gen.go -version "xxx" -test >table_test.go |
| 14 | // |
| 15 | // The version is derived from information found at |
| 16 | // http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat |
| 17 | // which is linked from http://publicsuffix.org/list/. |
| 18 | // |
| 19 | // To fetch a particular hg revision, such as 05b11a8d1ace, pass |
| 20 | // -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat" |
| 21 | |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 22 | import ( |
| 23 | "bufio" |
| 24 | "bytes" |
| 25 | "flag" |
| 26 | "fmt" |
| 27 | "go/format" |
| 28 | "io" |
| 29 | "net/http" |
| 30 | "os" |
| 31 | "sort" |
| 32 | "strings" |
Nigel Tao | cbecf2f | 2012-12-20 19:36:00 +1100 | [diff] [blame] | 33 | |
| 34 | "code.google.com/p/go.net/idna" |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 35 | ) |
| 36 | |
| 37 | const ( |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 38 | nodesBitsChildren = 9 |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 39 | nodesBitsICANN = 1 |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 40 | nodesBitsTextOffset = 15 |
| 41 | nodesBitsTextLength = 6 |
| 42 | |
| 43 | childrenBitsWildcard = 1 |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 44 | childrenBitsNodeType = 2 |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 45 | childrenBitsHi = 14 |
| 46 | childrenBitsLo = 14 |
| 47 | ) |
| 48 | |
| 49 | const ( |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 50 | nodeTypeNormal = 0 |
| 51 | nodeTypeException = 1 |
| 52 | nodeTypeParentOnly = 2 |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 53 | numNodeType = 3 |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 54 | ) |
| 55 | |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 56 | func nodeTypeStr(n int) string { |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 57 | switch n { |
| 58 | case nodeTypeNormal: |
| 59 | return "+" |
| 60 | case nodeTypeException: |
| 61 | return "!" |
| 62 | case nodeTypeParentOnly: |
| 63 | return "o" |
| 64 | } |
| 65 | panic("unreachable") |
| 66 | } |
| 67 | |
| 68 | var ( |
| 69 | labelEncoding = map[string]uint32{} |
| 70 | labelsList = []string{} |
| 71 | labelsMap = map[string]bool{} |
| 72 | rules = []string{} |
| 73 | |
| 74 | crush = flag.Bool("crush", true, "make the generated node text as small as possible") |
| 75 | subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging") |
| 76 | url = flag.String("url", |
| 77 | "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1", |
| 78 | "URL of the publicsuffix.org list. If empty, stdin is read instead") |
| 79 | v = flag.Bool("v", false, "verbose output (to stderr)") |
| 80 | version = flag.String("version", "", "the effective_tld_names.dat version") |
| 81 | test = flag.Bool("test", false, "generate table_test.go") |
| 82 | ) |
| 83 | |
| 84 | func main() { |
| 85 | if err := main1(); err != nil { |
| 86 | fmt.Fprintln(os.Stderr, err) |
| 87 | os.Exit(1) |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | func main1() error { |
| 92 | flag.Parse() |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 93 | if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 94 | return fmt.Errorf("not enough bits to encode the nodes table") |
| 95 | } |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 96 | if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 97 | return fmt.Errorf("not enough bits to encode the children table") |
| 98 | } |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 99 | if *version == "" { |
| 100 | return fmt.Errorf("-version was not specified") |
| 101 | } |
| 102 | var r io.Reader = os.Stdin |
| 103 | if *url != "" { |
| 104 | res, err := http.Get(*url) |
| 105 | if err != nil { |
| 106 | return err |
| 107 | } |
| 108 | if res.StatusCode != http.StatusOK { |
| 109 | return fmt.Errorf("bad GET status for %s: %d", *url, res.Status) |
| 110 | } |
| 111 | r = res.Body |
| 112 | defer res.Body.Close() |
| 113 | } |
| 114 | |
| 115 | var root node |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 116 | icann := false |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 117 | buf := new(bytes.Buffer) |
| 118 | br := bufio.NewReader(r) |
| 119 | for { |
| 120 | s, err := br.ReadString('\n') |
| 121 | if err != nil { |
| 122 | if err == io.EOF { |
| 123 | break |
| 124 | } |
| 125 | return err |
| 126 | } |
| 127 | s = strings.TrimSpace(s) |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 128 | if strings.Contains(s, "BEGIN ICANN DOMAINS") { |
| 129 | icann = true |
| 130 | continue |
| 131 | } |
| 132 | if strings.Contains(s, "END ICANN DOMAINS") { |
| 133 | icann = false |
| 134 | continue |
| 135 | } |
Nigel Tao | cbecf2f | 2012-12-20 19:36:00 +1100 | [diff] [blame] | 136 | if s == "" || strings.HasPrefix(s, "//") { |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 137 | continue |
| 138 | } |
Nigel Tao | cbecf2f | 2012-12-20 19:36:00 +1100 | [diff] [blame] | 139 | s, err = idna.ToASCII(s) |
| 140 | if err != nil { |
| 141 | return err |
| 142 | } |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 143 | |
| 144 | if *subset { |
| 145 | switch { |
Nigel Tao | 6179114 | 2013-01-22 21:23:30 +1100 | [diff] [blame] | 146 | case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"): |
| 147 | case s == "ak.us" || strings.HasSuffix(s, ".ak.us"): |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 148 | case s == "ao" || strings.HasSuffix(s, ".ao"): |
| 149 | case s == "ar" || strings.HasSuffix(s, ".ar"): |
| 150 | case s == "arpa" || strings.HasSuffix(s, ".arpa"): |
Nigel Tao | 6179114 | 2013-01-22 21:23:30 +1100 | [diff] [blame] | 151 | case s == "cy" || strings.HasSuffix(s, ".cy"): |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 152 | case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"): |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 153 | case s == "jp": |
| 154 | case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): |
| 155 | case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): |
Nigel Tao | 6179114 | 2013-01-22 21:23:30 +1100 | [diff] [blame] | 156 | case s == "om" || strings.HasSuffix(s, ".om"): |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 157 | case s == "uk" || strings.HasSuffix(s, ".uk"): |
Nigel Tao | 6179114 | 2013-01-22 21:23:30 +1100 | [diff] [blame] | 158 | case s == "uk.com" || strings.HasSuffix(s, ".uk.com"): |
Nigel Tao | cbecf2f | 2012-12-20 19:36:00 +1100 | [diff] [blame] | 159 | case s == "tw" || strings.HasSuffix(s, ".tw"): |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 160 | case s == "zw" || strings.HasSuffix(s, ".zw"): |
Nigel Tao | cbecf2f | 2012-12-20 19:36:00 +1100 | [diff] [blame] | 161 | case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): |
| 162 | // xn--p1ai is Russian-Cyrillic "рф". |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 163 | default: |
| 164 | continue |
| 165 | } |
| 166 | } |
| 167 | |
| 168 | rules = append(rules, s) |
| 169 | |
| 170 | nt, wildcard := nodeTypeNormal, false |
| 171 | switch { |
| 172 | case strings.HasPrefix(s, "*."): |
| 173 | s, nt = s[2:], nodeTypeParentOnly |
| 174 | wildcard = true |
| 175 | case strings.HasPrefix(s, "!"): |
| 176 | s, nt = s[1:], nodeTypeException |
| 177 | } |
| 178 | labels := strings.Split(s, ".") |
| 179 | for n, i := &root, len(labels)-1; i >= 0; i-- { |
| 180 | label := labels[i] |
| 181 | n = n.child(label) |
| 182 | if i == 0 { |
| 183 | if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { |
| 184 | n.nodeType = nt |
| 185 | } |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 186 | n.icann = n.icann && icann |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 187 | n.wildcard = n.wildcard || wildcard |
| 188 | } |
| 189 | labelsMap[label] = true |
| 190 | } |
| 191 | } |
| 192 | labelsList = make([]string, 0, len(labelsMap)) |
| 193 | for label := range labelsMap { |
| 194 | labelsList = append(labelsList, label) |
| 195 | } |
| 196 | sort.Strings(labelsList) |
| 197 | |
| 198 | p := printReal |
| 199 | if *test { |
| 200 | p = printTest |
| 201 | } |
| 202 | if err := p(buf, &root); err != nil { |
| 203 | return err |
| 204 | } |
| 205 | |
| 206 | b, err := format.Source(buf.Bytes()) |
| 207 | if err != nil { |
| 208 | return err |
| 209 | } |
| 210 | _, err = os.Stdout.Write(b) |
| 211 | return err |
| 212 | } |
| 213 | |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 214 | func printTest(w io.Writer, n *node) error { |
| 215 | fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") |
| 216 | fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n") |
| 217 | for _, rule := range rules { |
| 218 | fmt.Fprintf(w, "%q,\n", rule) |
| 219 | } |
| 220 | fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") |
| 221 | if err := n.walk(w, printNodeLabel); err != nil { |
| 222 | return err |
| 223 | } |
| 224 | fmt.Fprintf(w, "}\n") |
| 225 | return nil |
| 226 | } |
| 227 | |
| 228 | func printReal(w io.Writer, n *node) error { |
| 229 | const header = `// generated by go run gen.go; DO NOT EDIT |
| 230 | |
| 231 | package publicsuffix |
| 232 | |
| 233 | const version = %q |
| 234 | |
| 235 | const ( |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 236 | nodesBitsChildren = %d |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 237 | nodesBitsICANN = %d |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 238 | nodesBitsTextOffset = %d |
| 239 | nodesBitsTextLength = %d |
| 240 | |
| 241 | childrenBitsWildcard = %d |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 242 | childrenBitsNodeType = %d |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 243 | childrenBitsHi = %d |
| 244 | childrenBitsLo = %d |
| 245 | ) |
| 246 | |
| 247 | const ( |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 248 | nodeTypeNormal = %d |
| 249 | nodeTypeException = %d |
| 250 | nodeTypeParentOnly = %d |
| 251 | ) |
| 252 | |
| 253 | // numTLD is the number of top level domains. |
| 254 | const numTLD = %d |
| 255 | |
| 256 | ` |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 257 | fmt.Fprintf(w, header, *version, |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 258 | nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength, |
| 259 | childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo, |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 260 | nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children)) |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 261 | |
| 262 | text := makeText() |
| 263 | if text == "" { |
| 264 | return fmt.Errorf("internal error: makeText returned no text") |
| 265 | } |
| 266 | for _, label := range labelsList { |
| 267 | offset, length := strings.Index(text, label), len(label) |
| 268 | if offset < 0 { |
| 269 | return fmt.Errorf("internal error: could not find %q in text %q", label, text) |
| 270 | } |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 271 | if offset >= 1<<nodesBitsTextOffset || length >= 1<<nodesBitsTextLength { |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 272 | return fmt.Errorf("text offset/length is too large: %d/%d", offset, length) |
| 273 | } |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 274 | labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length) |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 275 | } |
| 276 | fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") |
| 277 | for len(text) > 0 { |
| 278 | n, plus := len(text), "" |
| 279 | if n > 64 { |
| 280 | n, plus = 64, " +" |
| 281 | } |
| 282 | fmt.Fprintf(w, "%q%s\n", text[:n], plus) |
| 283 | text = text[n:] |
| 284 | } |
| 285 | |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 286 | n.walk(w, assignIndexes) |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 287 | |
| 288 | fmt.Fprintf(w, ` |
| 289 | |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 290 | // nodes is the list of nodes. Each node is represented as a uint32, which |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 291 | // encodes the node's children, wildcard bit and node type (as an index into |
| 292 | // the children array), ICANN bit and text. |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 293 | // |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 294 | // In the //-comment after each node's data, the nodes indexes of the children |
| 295 | // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The |
| 296 | // nodeType is printed as + for normal, ! for exception, and o for parent-only |
| 297 | // nodes that have children but don't match a domain label in their own right. |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 298 | // An I denotes an ICANN domain. |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 299 | // |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 300 | // The layout within the uint32, from MSB to LSB, is: |
| 301 | // [%2d bits] unused |
| 302 | // [%2d bits] children index |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 303 | // [%2d bits] ICANN bit |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 304 | // [%2d bits] text index |
| 305 | // [%2d bits] text length |
| 306 | var nodes = [...]uint32{ |
| 307 | `, |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 308 | 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength, |
| 309 | nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength) |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 310 | if err := n.walk(w, printNode); err != nil { |
| 311 | return err |
| 312 | } |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 313 | fmt.Fprintf(w, `} |
| 314 | |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 315 | // children is the list of nodes' children, the parent's wildcard bit and the |
| 316 | // parent's node type. If a node has no children then their children index |
| 317 | // will be in the range [0, 6), depending on the wildcard bit and node type. |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 318 | // |
| 319 | // The layout within the uint32, from MSB to LSB, is: |
| 320 | // [%2d bits] unused |
| 321 | // [%2d bits] wildcard bit |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 322 | // [%2d bits] node type |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 323 | // [%2d bits] high nodes index (exclusive) of children |
| 324 | // [%2d bits] low nodes index (inclusive) of children |
| 325 | var children=[...]uint32{ |
| 326 | `, |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 327 | 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo, |
| 328 | childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo) |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 329 | for i, c := range childrenEncoding { |
| 330 | s := "---------------" |
| 331 | lo := c & (1<<childrenBitsLo - 1) |
| 332 | hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1) |
| 333 | if lo != hi { |
| 334 | s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi) |
| 335 | } |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 336 | nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1) |
| 337 | wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0 |
| 338 | fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n", |
| 339 | c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType)) |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 340 | } |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 341 | fmt.Fprintf(w, "}\n") |
| 342 | return nil |
| 343 | } |
| 344 | |
| 345 | type node struct { |
| 346 | label string |
| 347 | nodeType int |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 348 | icann bool |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 349 | wildcard bool |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 350 | // nodesIndex and childrenIndex are the index of this node in the nodes |
| 351 | // and the index of its children offset/length in the children arrays. |
| 352 | nodesIndex, childrenIndex int |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 353 | // firstChild is the index of this node's first child, or zero if this |
| 354 | // node has no children. |
| 355 | firstChild int |
| 356 | // children are the node's children, in strictly increasing node label order. |
| 357 | children []*node |
| 358 | } |
| 359 | |
| 360 | func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { |
| 361 | if err := f(w, n); err != nil { |
| 362 | return err |
| 363 | } |
| 364 | for _, c := range n.children { |
| 365 | if err := c.walk(w, f); err != nil { |
| 366 | return err |
| 367 | } |
| 368 | } |
| 369 | return nil |
| 370 | } |
| 371 | |
| 372 | // child returns the child of n with the given label. The child is created if |
| 373 | // it did not exist beforehand. |
| 374 | func (n *node) child(label string) *node { |
| 375 | for _, c := range n.children { |
| 376 | if c.label == label { |
| 377 | return c |
| 378 | } |
| 379 | } |
| 380 | c := &node{ |
| 381 | label: label, |
| 382 | nodeType: nodeTypeParentOnly, |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 383 | icann: true, |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 384 | } |
| 385 | n.children = append(n.children, c) |
| 386 | sort.Sort(byLabel(n.children)) |
| 387 | return c |
| 388 | } |
| 389 | |
| 390 | type byLabel []*node |
| 391 | |
| 392 | func (b byLabel) Len() int { return len(b) } |
| 393 | func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
| 394 | func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label } |
| 395 | |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 396 | var nextNodesIndex int |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 397 | |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 398 | // childrenEncoding are the encoded entries in the generated children array. |
| 399 | // All these pre-defined entries have no children. |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 400 | var childrenEncoding = []uint32{ |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 401 | 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal. |
| 402 | 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException. |
| 403 | 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly. |
| 404 | 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal. |
| 405 | 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException. |
| 406 | 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly. |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 407 | } |
| 408 | |
| 409 | var firstCallToAssignIndexes = true |
| 410 | |
| 411 | func assignIndexes(w io.Writer, n *node) error { |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 412 | if len(n.children) != 0 { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 413 | // Assign nodesIndex. |
| 414 | n.firstChild = nextNodesIndex |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 415 | for _, c := range n.children { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 416 | c.nodesIndex = nextNodesIndex |
| 417 | nextNodesIndex++ |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 418 | } |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 419 | |
| 420 | // The root node's children is implicit. |
| 421 | if firstCallToAssignIndexes { |
| 422 | firstCallToAssignIndexes = false |
| 423 | return nil |
| 424 | } |
| 425 | |
| 426 | // Assign childrenIndex. |
| 427 | if len(childrenEncoding) >= 1<<nodesBitsChildren { |
| 428 | return fmt.Errorf("children table is too large") |
| 429 | } |
| 430 | n.childrenIndex = len(childrenEncoding) |
| 431 | lo := uint32(n.firstChild) |
| 432 | hi := lo + uint32(len(n.children)) |
| 433 | if lo >= 1<<childrenBitsLo || hi >= 1<<childrenBitsHi { |
| 434 | return fmt.Errorf("children lo/hi is too large: %d/%d", lo, hi) |
| 435 | } |
| 436 | enc := hi<<childrenBitsLo | lo |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 437 | enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi) |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 438 | if n.wildcard { |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 439 | enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType) |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 440 | } |
| 441 | childrenEncoding = append(childrenEncoding, enc) |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 442 | } else { |
| 443 | n.childrenIndex = n.nodeType |
| 444 | if n.wildcard { |
| 445 | n.childrenIndex += numNodeType |
| 446 | } |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 447 | } |
| 448 | return nil |
| 449 | } |
| 450 | |
| 451 | func printNode(w io.Writer, n *node) error { |
| 452 | for _, c := range n.children { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 453 | s := "---------------" |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 454 | if len(c.children) != 0 { |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 455 | s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children)) |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 456 | } |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 457 | encoding := labelEncoding[c.label] |
| 458 | if c.icann { |
| 459 | encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset) |
| 460 | } |
| 461 | encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN) |
| 462 | fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n", |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 463 | encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard), |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 464 | nodeTypeStr(c.nodeType), icannStr(c.icann), c.label, |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 465 | ) |
| 466 | } |
| 467 | return nil |
| 468 | } |
| 469 | |
| 470 | func printNodeLabel(w io.Writer, n *node) error { |
| 471 | for _, c := range n.children { |
| 472 | fmt.Fprintf(w, "%q,\n", c.label) |
| 473 | } |
| 474 | return nil |
| 475 | } |
| 476 | |
Nigel Tao | b8ab510 | 2013-01-09 22:10:50 +1100 | [diff] [blame] | 477 | func icannStr(icann bool) string { |
| 478 | if icann { |
| 479 | return "I" |
| 480 | } |
| 481 | return " " |
| 482 | } |
| 483 | |
Nigel Tao | 0f34b77 | 2012-12-22 12:09:13 +1100 | [diff] [blame] | 484 | func wildcardStr(wildcard bool) string { |
| 485 | if wildcard { |
| 486 | return "*" |
| 487 | } |
| 488 | return " " |
| 489 | } |
| 490 | |
Nigel Tao | 67a3048 | 2012-12-12 15:58:52 +1100 | [diff] [blame] | 491 | // makeText combines all the strings in labelsList to form one giant string. |
| 492 | // If the crush flag is true, then overlapping strings will be merged: "arpa" |
| 493 | // and "parliament" could yield "arparliament". |
| 494 | func makeText() string { |
| 495 | if !*crush { |
| 496 | return strings.Join(labelsList, "") |
| 497 | } |
| 498 | |
| 499 | beforeLength := 0 |
| 500 | for _, s := range labelsList { |
| 501 | beforeLength += len(s) |
| 502 | } |
| 503 | |
| 504 | // Make a copy of labelsList. |
| 505 | ss := append(make([]string, 0, len(labelsList)), labelsList...) |
| 506 | |
| 507 | // Remove strings that are substrings of other strings. |
| 508 | for changed := true; changed; { |
| 509 | changed = false |
| 510 | for i, s := range ss { |
| 511 | if s == "" { |
| 512 | continue |
| 513 | } |
| 514 | for j, t := range ss { |
| 515 | if i != j && t != "" && strings.Contains(s, t) { |
| 516 | changed = true |
| 517 | ss[j] = "" |
| 518 | } |
| 519 | } |
| 520 | } |
| 521 | } |
| 522 | |
| 523 | // Remove the empty strings. |
| 524 | sort.Strings(ss) |
| 525 | for len(ss) > 0 && ss[0] == "" { |
| 526 | ss = ss[1:] |
| 527 | } |
| 528 | |
| 529 | // Join strings where one suffix matches another prefix. |
| 530 | for { |
| 531 | // Find best i, j, k such that ss[i][len-k:] == ss[j][:k], |
| 532 | // maximizing overlap length k. |
| 533 | besti := -1 |
| 534 | bestj := -1 |
| 535 | bestk := 0 |
| 536 | for i, s := range ss { |
| 537 | if s == "" { |
| 538 | continue |
| 539 | } |
| 540 | for j, t := range ss { |
| 541 | if i == j { |
| 542 | continue |
| 543 | } |
| 544 | for k := bestk + 1; k <= len(s) && k <= len(t); k++ { |
| 545 | if s[len(s)-k:] == t[:k] { |
| 546 | besti = i |
| 547 | bestj = j |
| 548 | bestk = k |
| 549 | } |
| 550 | } |
| 551 | } |
| 552 | } |
| 553 | if bestk > 0 { |
| 554 | if *v { |
| 555 | fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n", |
| 556 | bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj]) |
| 557 | } |
| 558 | ss[besti] += ss[bestj][bestk:] |
| 559 | ss[bestj] = "" |
| 560 | continue |
| 561 | } |
| 562 | break |
| 563 | } |
| 564 | |
| 565 | text := strings.Join(ss, "") |
| 566 | if *v { |
| 567 | fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) |
| 568 | } |
| 569 | return text |
| 570 | } |