| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // +build ignore |
| |
| package main |
| |
| // This program generates table.go and table_test.go. |
| // Invoke as: |
| // |
| // go run gen.go -version "xxx" >table.go |
| // go run gen.go -version "xxx" -test >table_test.go |
| // |
| // The version is derived from information found at |
| // http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat |
| // which is linked from http://publicsuffix.org/list/. |
| // |
| // To fetch a particular hg revision, such as 05b11a8d1ace, pass |
| // -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat" |
| |
| import ( |
| "bufio" |
| "bytes" |
| "flag" |
| "fmt" |
| "go/format" |
| "io" |
| "net/http" |
| "os" |
| "sort" |
| "strings" |
| |
| "code.google.com/p/go.net/idna" |
| ) |
| |
// Node types. The numeric values are written verbatim into the generated
// table.go and are also packed into the top two bits of each node's first
// uint32, so they must stay 0, 1 and 2.
const (
	nodeTypeNormal     = iota // 0: set for a plain rule
	nodeTypeException         // 1: set for a rule written with a leading "!"
	nodeTypeParentOnly        // 2: initial type of every node; also used for "*." rules
)

// nodeTypeString returns the one-character marker used for a node type in the
// comment that follows each generated node: "+" for normal, "!" for exception
// and "o" for parent-only. It panics for any other value.
func nodeTypeString(n int) string {
	markers := [...]string{
		nodeTypeNormal:     "+",
		nodeTypeException:  "!",
		nodeTypeParentOnly: "o",
	}
	if n < 0 || n >= len(markers) {
		panic("unreachable")
	}
	return markers[n]
}
| |
// Generator state shared between the parsing and printing steps.
var (
	// labelEncoding maps a label to its packed (text offset<<8 | length)
	// value; it is filled in by printReal.
	labelEncoding = make(map[string]uint32)
	// labelsList holds every distinct label in sorted order; main1 rebuilds
	// it from labelsMap once all rules have been read.
	labelsList = make([]string, 0)
	// labelsMap is the set of labels seen while reading rules.
	labelsMap = make(map[string]bool)
	// rules collects the accepted rules in input order, after conversion to
	// ASCII and still carrying any "*." or "!" prefix.
	rules = make([]string, 0)
)

// Command-line flags.
var (
	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url    = flag.String(
		"url",
		"http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1",
		"URL of the publicsuffix.org list. If empty, stdin is read instead",
	)
	v       = flag.Bool("v", false, "verbose output (to stderr)")
	version = flag.String("version", "", "the effective_tld_names.dat version")
	test    = flag.Bool("test", false, "generate table_test.go")
)
| |
| func main() { |
| if err := main1(); err != nil { |
| fmt.Fprintln(os.Stderr, err) |
| os.Exit(1) |
| } |
| } |
| |
| func main1() error { |
| flag.Parse() |
| if *version == "" { |
| return fmt.Errorf("-version was not specified") |
| } |
| var r io.Reader = os.Stdin |
| if *url != "" { |
| res, err := http.Get(*url) |
| if err != nil { |
| return err |
| } |
| if res.StatusCode != http.StatusOK { |
| return fmt.Errorf("bad GET status for %s: %d", *url, res.Status) |
| } |
| r = res.Body |
| defer res.Body.Close() |
| } |
| |
| var root node |
| buf := new(bytes.Buffer) |
| br := bufio.NewReader(r) |
| for { |
| s, err := br.ReadString('\n') |
| if err != nil { |
| if err == io.EOF { |
| break |
| } |
| return err |
| } |
| s = strings.TrimSpace(s) |
| if s == "" || strings.HasPrefix(s, "//") { |
| continue |
| } |
| s, err = idna.ToASCII(s) |
| if err != nil { |
| return err |
| } |
| |
| if *subset { |
| switch { |
| case s == "ao" || strings.HasSuffix(s, ".ao"): |
| case s == "ar" || strings.HasSuffix(s, ".ar"): |
| case s == "arpa" || strings.HasSuffix(s, ".arpa"): |
| case s == "jp": |
| case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"): |
| case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"): |
| case s == "uk" || strings.HasSuffix(s, ".uk"): |
| case s == "tw" || strings.HasSuffix(s, ".tw"): |
| case s == "zw" || strings.HasSuffix(s, ".zw"): |
| case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"): |
| // xn--p1ai is Russian-Cyrillic "рф". |
| default: |
| continue |
| } |
| } |
| |
| rules = append(rules, s) |
| |
| nt, wildcard := nodeTypeNormal, false |
| switch { |
| case strings.HasPrefix(s, "*."): |
| s, nt = s[2:], nodeTypeParentOnly |
| wildcard = true |
| case strings.HasPrefix(s, "!"): |
| s, nt = s[1:], nodeTypeException |
| } |
| labels := strings.Split(s, ".") |
| for n, i := &root, len(labels)-1; i >= 0; i-- { |
| label := labels[i] |
| n = n.child(label) |
| if i == 0 { |
| if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly { |
| n.nodeType = nt |
| } |
| n.wildcard = n.wildcard || wildcard |
| } |
| labelsMap[label] = true |
| } |
| } |
| labelsList = make([]string, 0, len(labelsMap)) |
| for label := range labelsMap { |
| labelsList = append(labelsList, label) |
| } |
| sort.Strings(labelsList) |
| |
| p := printReal |
| if *test { |
| p = printTest |
| } |
| if err := p(buf, &root); err != nil { |
| return err |
| } |
| |
| b, err := format.Source(buf.Bytes()) |
| if err != nil { |
| return err |
| } |
| _, err = os.Stdout.Write(b) |
| return err |
| } |
| |
| func printTest(w io.Writer, n *node) error { |
| fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n") |
| fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n") |
| for _, rule := range rules { |
| fmt.Fprintf(w, "%q,\n", rule) |
| } |
| fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n") |
| if err := n.walk(w, printNodeLabel); err != nil { |
| return err |
| } |
| fmt.Fprintf(w, "}\n") |
| return nil |
| } |
| |
| func printReal(w io.Writer, n *node) error { |
| const header = `// generated by go run gen.go; DO NOT EDIT |
| |
| package publicsuffix |
| |
| const version = %q |
| |
| const ( |
| nodeTypeNormal = %d |
| nodeTypeException = %d |
| nodeTypeParentOnly = %d |
| ) |
| |
| // numTLD is the number of top level domains. |
| const numTLD = %d |
| |
| ` |
| fmt.Fprintf(w, header, *version, nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children)) |
| |
| text := makeText() |
| if text == "" { |
| return fmt.Errorf("internal error: makeText returned no text") |
| } |
| for _, label := range labelsList { |
| offset, length := strings.Index(text, label), len(label) |
| if offset < 0 { |
| return fmt.Errorf("internal error: could not find %q in text %q", label, text) |
| } |
| if offset >= 1<<24 || length >= 1<<8 { |
| return fmt.Errorf("text offset/length is too large: %d/%d", offset, length) |
| } |
| labelEncoding[label] = uint32(offset)<<8 | uint32(length) |
| } |
| fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ") |
| for len(text) > 0 { |
| n, plus := len(text), "" |
| if n > 64 { |
| n, plus = 64, " +" |
| } |
| fmt.Fprintf(w, "%q%s\n", text[:n], plus) |
| text = text[n:] |
| } |
| |
| n.walk(w, assignNodeIndexes) |
| |
| fmt.Fprintf(w, ` |
| |
| // Nodes is the list of nodes. Each node is encoded as two uint32 values. |
| // |
| // The first uint32 encodes the node's children, nodeType, and a wildcard bit. |
| // In the //-comment after each node's data, the indexes of the children are |
| // formatted as (0x1234-0x1256). The nodeType is printed as + for normal, ! for |
| // exception, and o for parent-only nodes that have children but don't match a |
| // domain in their own right. The * denotes the wildcard bit. The layout within |
| // the uint32, from MSB to LSB, is: |
| // [2] nodeType [1] wildcard [13] number of children [16] first child. |
| // If a node has no children then the low 29 bits are zero. |
| // |
| // The second uint32 encodes the node's text. The layout is: |
| // [24] text offset [8] text length. |
| // |
| // TODO(nigeltao): this table has a lot of zeroes, for childless nodes. It |
| // would be tight, but it should be possible to use only 32 bits per node |
| // instead of 64, with an offset into a parent-child table. A back-of-the- |
| // envelope calculation suggests that at 6000 rows (of which 90%% are leaves), |
| // this could save an extra 20KiB of data. |
| var nodes = [...][2]uint32{ |
| `) |
| if err := n.walk(w, printNode); err != nil { |
| return err |
| } |
| fmt.Fprintf(w, "}\n") |
| return nil |
| } |
| |
| type node struct { |
| label string |
| nodeType int |
| wildcard bool |
| // index is the index of this node in the nodes array. |
| index int |
| // firstChild is the index of this node's first child, or zero if this |
| // node has no children. |
| firstChild int |
| // children are the node's children, in strictly increasing node label order. |
| children []*node |
| } |
| |
| func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error { |
| if err := f(w, n); err != nil { |
| return err |
| } |
| for _, c := range n.children { |
| if err := c.walk(w, f); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // child returns the child of n with the given label. The child is created if |
| // it did not exist beforehand. |
| func (n *node) child(label string) *node { |
| for _, c := range n.children { |
| if c.label == label { |
| return c |
| } |
| } |
| c := &node{ |
| label: label, |
| nodeType: nodeTypeParentOnly, |
| } |
| n.children = append(n.children, c) |
| sort.Sort(byLabel(n.children)) |
| return c |
| } |
| |
| type byLabel []*node |
| |
| func (b byLabel) Len() int { return len(b) } |
| func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
| func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label } |
| |
| var nextNodeIndex int |
| |
| func assignNodeIndexes(w io.Writer, n *node) error { |
| if len(n.children) != 0 { |
| n.firstChild = nextNodeIndex |
| for _, c := range n.children { |
| c.index = nextNodeIndex |
| nextNodeIndex++ |
| } |
| } |
| return nil |
| } |
| |
| func printNode(w io.Writer, n *node) error { |
| for _, c := range n.children { |
| s := "-------------" |
| if len(c.children) != 0 { |
| s = fmt.Sprintf("0x%04x-0x%04x", c.firstChild, c.firstChild+len(c.children)) |
| } |
| wildcardBit, wildcardStr := uint32(0), ' ' |
| if c.wildcard { |
| wildcardBit, wildcardStr = 1<<29, '*' |
| } |
| if c.firstChild >= 1<<16 || len(c.children) >= 1<<13 { |
| return fmt.Errorf("nodes offset/length is too large: %d/%d", c.firstChild, len(c.children)) |
| } |
| encoding := uint32(c.nodeType<<30) | wildcardBit | uint32(len(c.children)<<16) | uint32(c.firstChild) |
| fmt.Fprintf(w, "{0x%08x, 0x%08x}, // 0x%04x (%s) %s%c %s\n", |
| encoding, labelEncoding[c.label], c.index, s, |
| nodeTypeString(c.nodeType), wildcardStr, c.label, |
| ) |
| } |
| return nil |
| } |
| |
| func printNodeLabel(w io.Writer, n *node) error { |
| for _, c := range n.children { |
| fmt.Fprintf(w, "%q,\n", c.label) |
| } |
| return nil |
| } |
| |
| // makeText combines all the strings in labelsList to form one giant string. |
| // If the crush flag is true, then overlapping strings will be merged: "arpa" |
| // and "parliament" could yield "arparliament". |
| func makeText() string { |
| if !*crush { |
| return strings.Join(labelsList, "") |
| } |
| |
| beforeLength := 0 |
| for _, s := range labelsList { |
| beforeLength += len(s) |
| } |
| |
| // Make a copy of labelsList. |
| ss := append(make([]string, 0, len(labelsList)), labelsList...) |
| |
| // Remove strings that are substrings of other strings. |
| for changed := true; changed; { |
| changed = false |
| for i, s := range ss { |
| if s == "" { |
| continue |
| } |
| for j, t := range ss { |
| if i != j && t != "" && strings.Contains(s, t) { |
| changed = true |
| ss[j] = "" |
| } |
| } |
| } |
| } |
| |
| // Remove the empty strings. |
| sort.Strings(ss) |
| for len(ss) > 0 && ss[0] == "" { |
| ss = ss[1:] |
| } |
| |
| // Join strings where one suffix matches another prefix. |
| for { |
| // Find best i, j, k such that ss[i][len-k:] == ss[j][:k], |
| // maximizing overlap length k. |
| besti := -1 |
| bestj := -1 |
| bestk := 0 |
| for i, s := range ss { |
| if s == "" { |
| continue |
| } |
| for j, t := range ss { |
| if i == j { |
| continue |
| } |
| for k := bestk + 1; k <= len(s) && k <= len(t); k++ { |
| if s[len(s)-k:] == t[:k] { |
| besti = i |
| bestj = j |
| bestk = k |
| } |
| } |
| } |
| } |
| if bestk > 0 { |
| if *v { |
| fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n", |
| bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj]) |
| } |
| ss[besti] += ss[bestj][bestk:] |
| ss[bestj] = "" |
| continue |
| } |
| break |
| } |
| |
| text := strings.Join(ss, "") |
| if *v { |
| fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text)) |
| } |
| return text |
| } |