blob: d71865966e022fd318585b98f390ccbff346f6cc [file] [log] [blame]
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates table.go and table_test.go.
// Invoke as:
//
// go run gen.go -version "xxx" >table.go
// go run gen.go -version "xxx" -test >table_test.go
//
// The version is derived from information found at
// http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat
// which is linked from http://publicsuffix.org/list/.
//
// To fetch a particular hg revision, such as 05b11a8d1ace, pass
// -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat"
import (
"bufio"
"bytes"
"flag"
"fmt"
"go/format"
"io"
"net/http"
"os"
"sort"
"strings"
"code.google.com/p/go.net/idna"
)
// Node types. The numeric values are copied into the generated table.go and
// packed into the top two bits of every node's first uint32, so they must
// remain 0, 1 and 2 in this order.
const (
	// nodeTypeNormal marks a node that corresponds to an ordinary rule.
	nodeTypeNormal = iota // 0
	// nodeTypeException marks a node that corresponds to a "!" rule.
	nodeTypeException // 1
	// nodeTypeParentOnly is the type every node starts with; it is kept by
	// nodes that only exist as parents of other nodes or as the base of a
	// "*." wildcard rule.
	nodeTypeParentOnly // 2
)
// nodeTypeString returns the one-character marker that stands for node type n
// in the comments of the generated table: "+" for a normal node, "!" for an
// exception node and "o" for a parent-only node. Any other value is a
// programming error and panics.
func nodeTypeString(n int) string {
	markers := map[int]string{
		nodeTypeNormal:     "+",
		nodeTypeException:  "!",
		nodeTypeParentOnly: "o",
	}
	marker, known := markers[n]
	if !known {
		panic("unreachable")
	}
	return marker
}
var (
	// labelEncoding maps a label to its packed position in the combined
	// text, offset<<8 | length. It is filled in by printReal.
	labelEncoding = make(map[string]uint32)
	// labelsList is the sorted list of distinct labels; main1 rebuilds it
	// from labelsMap once all rules have been read.
	labelsList = make([]string, 0)
	// labelsMap is the set of every label seen while reading the rules.
	labelsMap = make(map[string]bool)
	// rules holds every accepted rule, in input order, in its ASCII form.
	rules = make([]string, 0)

	// Command-line flags.
	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url    = flag.String("url",
		"http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1",
		"URL of the publicsuffix.org list. If empty, stdin is read instead")
	v       = flag.Bool("v", false, "verbose output (to stderr)")
	version = flag.String("version", "", "the effective_tld_names.dat version")
	test    = flag.Bool("test", false, "generate table_test.go")
)
// main runs main1 and, if it fails, prints the error to stderr and exits with
// status 1.
func main() {
	err := main1()
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}
// main1 does the real work of the program. It parses the flags, reads the
// public suffix list from -url (or from stdin when -url is empty), builds a
// trie of the rules, and writes the gofmt-ed source of table.go (or of
// table_test.go when -test is set) to stdout.
//
// As a side effect it fills the package-level rules, labelsMap and labelsList.
// Any failure is returned so that main can report it.
func main1() error {
	flag.Parse()
	if *version == "" {
		return fmt.Errorf("-version was not specified")
	}

	var r io.Reader = os.Stdin
	if *url != "" {
		res, err := http.Get(*url)
		if err != nil {
			return err
		}
		// Register the close before the status check so that the body is
		// released on the bad-status return as well.
		defer res.Body.Close()
		if res.StatusCode != http.StatusOK {
			// res.Status is a string such as "404 Not Found".
			return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
		}
		r = res.Body
	}

	var root node
	buf := new(bytes.Buffer)
	br := bufio.NewReader(r)
	for eof := false; !eof; {
		s, err := br.ReadString('\n')
		if err != nil {
			if err != io.EOF {
				return err
			}
			// ReadString returns whatever it read before reaching EOF, so a
			// final line without a trailing newline must still be processed.
			eof = true
		}
		s = strings.TrimSpace(s)
		if s == "" || strings.HasPrefix(s, "//") {
			continue
		}
		s, err = idna.ToASCII(s)
		if err != nil {
			return err
		}

		if *subset {
			switch {
			case s == "ao" || strings.HasSuffix(s, ".ao"):
			case s == "ar" || strings.HasSuffix(s, ".ar"):
			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
			case s == "jp":
			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
			case s == "uk" || strings.HasSuffix(s, ".uk"):
			case s == "tw" || strings.HasSuffix(s, ".tw"):
			case s == "zw" || strings.HasSuffix(s, ".zw"):
			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
				// xn--p1ai is Russian-Cyrillic "рф".
			default:
				continue
			}
		}

		rules = append(rules, s)

		// Strip the rule's prefix, remembering what it meant.
		nt, wildcard := nodeTypeNormal, false
		switch {
		case strings.HasPrefix(s, "*."):
			s, nt = s[2:], nodeTypeParentOnly
			wildcard = true
		case strings.HasPrefix(s, "!"):
			s, nt = s[1:], nodeTypeException
		}

		// Insert the labels into the trie from the rightmost (TLD) label to
		// the leftmost one; i == 0 is the node the rule itself refers to.
		labels := strings.Split(s, ".")
		for n, i := &root, len(labels)-1; i >= 0; i-- {
			label := labels[i]
			n = n.child(label)
			if i == 0 {
				// Only a node that is still parent-only takes the rule's
				// type; a type set by an earlier rule is not overwritten.
				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
					n.nodeType = nt
				}
				n.wildcard = n.wildcard || wildcard
			}
			labelsMap[label] = true
		}
	}

	labelsList = make([]string, 0, len(labelsMap))
	for label := range labelsMap {
		labelsList = append(labelsList, label)
	}
	sort.Strings(labelsList)

	p := printReal
	if *test {
		p = printTest
	}
	if err := p(buf, &root); err != nil {
		return err
	}

	// Run the generated source through gofmt before emitting it.
	b, err := format.Source(buf.Bytes())
	if err != nil {
		return err
	}
	_, err = os.Stdout.Write(b)
	return err
}
// printTest writes the source of table_test.go to w: the rules slice holding
// every rule that was read, followed by the nodeLabels slice holding the label
// of every node in the order produced by walking the trie rooted at n.
//
// It returns the first error reported by the walk; in that case the closing
// brace of nodeLabels is not written.
func printTest(w io.Writer, n *node) error {
	const (
		banner     = "// generated by go run gen.go; DO NOT EDIT\n\n"
		rulesOpen  = "package publicsuffix\n\nvar rules = [...]string{\n"
		labelsOpen = "}\n\nvar nodeLabels = [...]string{\n"
		labelsEnd  = "}\n"
	)

	fmt.Fprint(w, banner)
	fmt.Fprint(w, rulesOpen)
	for i := range rules {
		fmt.Fprintf(w, "%q,\n", rules[i])
	}
	fmt.Fprint(w, labelsOpen)

	err := n.walk(w, printNodeLabel)
	if err == nil {
		fmt.Fprint(w, labelsEnd)
	}
	return err
}
// printReal writes the source of table.go to w. In order, it emits:
//
//   - a header with the version, the node type constants and numTLD, which is
//     the number of children of the root node n;
//   - the text constant returned by makeText, split into 64-byte pieces;
//   - the nodes array, one line per node.
//
// Along the way it fills labelEncoding with the position of every label in
// the text, and assigns node indexes by walking the trie.
//
// It returns an error if makeText yields nothing, if a label cannot be found
// in the text, if an offset or length does not fit its bit field, or if
// printing a node fails.
func printReal(w io.Writer, n *node) error {
	const header = `// generated by go run gen.go; DO NOT EDIT
package publicsuffix
const version = %q
const (
nodeTypeNormal = %d
nodeTypeException = %d
nodeTypeParentOnly = %d
)
// numTLD is the number of top level domains.
const numTLD = %d
`
	fmt.Fprintf(w, header, *version, nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))

	text := makeText()
	if text == "" {
		return fmt.Errorf("internal error: makeText returned no text")
	}

	// Record where each label lives in the text: 24 bits of offset and
	// 8 bits of length.
	for _, label := range labelsList {
		offset := strings.Index(text, label)
		if offset < 0 {
			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
		}
		length := len(label)
		if offset >= 1<<24 || length >= 1<<8 {
			return fmt.Errorf("text offset/length is too large: %d/%d", offset, length)
		}
		labelEncoding[label] = uint32(offset)<<8 | uint32(length)
	}

	// Emit the text as a concatenation of quoted pieces of at most 64 bytes;
	// every piece but the last is followed by " +".
	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
	const pieceLen = 64
	for rest := text; len(rest) > 0; {
		if len(rest) <= pieceLen {
			fmt.Fprintf(w, "%q\n", rest)
			break
		}
		fmt.Fprintf(w, "%q +\n", rest[:pieceLen])
		rest = rest[pieceLen:]
	}

	// assignNodeIndexes never fails, so the result of this walk is ignored.
	n.walk(w, assignNodeIndexes)

	fmt.Fprintf(w, `
// Nodes is the list of nodes. Each node is encoded as two uint32 values.
//
// The first uint32 encodes the node's children, nodeType, and a wildcard bit.
// In the //-comment after each node's data, the indexes of the children are
// formatted as (0x1234-0x1256). The nodeType is printed as + for normal, ! for
// exception, and o for parent-only nodes that have children but don't match a
// domain in their own right. The * denotes the wildcard bit. The layout within
// the uint32, from MSB to LSB, is:
// [2] nodeType [1] wildcard [13] number of children [16] first child.
// If a node has no children then the low 29 bits are zero.
//
// The second uint32 encodes the node's text. The layout is:
// [24] text offset [8] text length.
//
// TODO(nigeltao): this table has a lot of zeroes, for childless nodes. It
// would be tight, but it should be possible to use only 32 bits per node
// instead of 64, with an offset into a parent-child table. A back-of-the-
// envelope calculation suggests that at 6000 rows (of which 90%% are leaves),
// this could save an extra 20KiB of data.
var nodes = [...][2]uint32{
`)
	err := n.walk(w, printNode)
	if err != nil {
		return err
	}
	fmt.Fprintf(w, "}\n")
	return nil
}
// node is one label in the trie of public suffix rules. The root node has an
// empty label; its children are the top-level domains.
type node struct {
	// label is the single domain label this node stands for.
	label string
	// nodeType is one of the nodeTypeXxx constants.
	nodeType int
	// wildcard records whether a "*." rule was seen for this node.
	wildcard bool

	// index is the position of this node in the generated nodes array.
	index int
	// firstChild is the position of this node's first child in the nodes
	// array. It is zero for a node without children.
	firstChild int

	// children holds the child nodes, kept in strictly increasing order of
	// their labels.
	children []*node
}
// walk calls f for n and then, recursively, for every descendant of n, each
// parent before its children and children in slice order. w is handed to f
// unchanged. The walk stops at the first non-nil error from f and returns it.
func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
	err := f(w, n)
	if err != nil {
		return err
	}
	kids := n.children
	for i := 0; i < len(kids); i++ {
		if err = kids[i].walk(w, f); err != nil {
			return err
		}
	}
	return nil
}
// child looks up the child of n whose label equals the given one and returns
// it. When there is no such child, a new parent-only node with that label is
// added to n.children, the children are re-sorted by label, and the new node
// is returned.
func (n *node) child(label string) *node {
	for i := range n.children {
		if existing := n.children[i]; existing.label == label {
			return existing
		}
	}

	created := new(node)
	created.label = label
	created.nodeType = nodeTypeParentOnly

	n.children = append(n.children, created)
	// Restore label order after appending at the end.
	sort.Sort(byLabel(n.children))
	return created
}
// byLabel implements sort.Interface, ordering nodes by increasing label.
type byLabel []*node

// Len returns the number of nodes.
func (b byLabel) Len() int {
	return len(b)
}

// Less reports whether the label at i sorts before the label at j.
func (b byLabel) Less(i, j int) bool {
	return b[j].label > b[i].label
}

// Swap exchanges the nodes at i and j.
func (b byLabel) Swap(i, j int) {
	held := b[i]
	b[i] = b[j]
	b[j] = held
}
// nextNodeIndex is the index that the next numbered node will receive.
var nextNodeIndex int

// assignNodeIndexes gives the children of n consecutive indexes starting at
// nextNodeIndex, stores the first of them in n.firstChild and advances
// nextNodeIndex past them. A node without children is left untouched.
//
// w is unused; the signature matches the callback type taken by node.walk.
// The returned error is always nil.
func assignNodeIndexes(w io.Writer, n *node) error {
	if len(n.children) == 0 {
		return nil
	}
	n.firstChild = nextNodeIndex
	for i, c := range n.children {
		c.index = n.firstChild + i
	}
	nextNodeIndex += len(n.children)
	return nil
}
// printNode writes one line of the generated nodes array to w for each child
// of n (not for n itself). A line holds the two uint32 values that encode the
// child, followed by a comment giving its index, the index range of its own
// children (or dashes if it has none), its node type marker, "*" if its
// wildcard bit is set, and its label.
//
// The first uint32 is laid out, from the most significant bit down, as
// 2 bits of nodeType, 1 wildcard bit, 13 bits of child count and 16 bits of
// first-child index. The second uint32 is the child's labelEncoding entry.
//
// An error is returned, and nothing further is written, as soon as a child's
// first-child index or child count does not fit its bit field.
func printNode(w io.Writer, n *node) error {
	for _, c := range n.children {
		count := len(c.children)
		if c.firstChild >= 1<<16 || count >= 1<<13 {
			return fmt.Errorf("nodes offset/length is too large: %d/%d", c.firstChild, count)
		}

		span := "-------------"
		if count != 0 {
			span = fmt.Sprintf("0x%04x-0x%04x", c.firstChild, c.firstChild+count)
		}

		var wildcardBit uint32
		wildcardStr := ' '
		if c.wildcard {
			wildcardBit = 1 << 29
			wildcardStr = '*'
		}

		encoding := uint32(c.nodeType<<30) | wildcardBit | uint32(count<<16) | uint32(c.firstChild)
		fmt.Fprintf(w, "{0x%08x, 0x%08x}, // 0x%04x (%s) %s%c %s\n",
			encoding, labelEncoding[c.label], c.index, span,
			nodeTypeString(c.nodeType), wildcardStr, c.label,
		)
	}
	return nil
}
// printNodeLabel writes the label of each child of n to w as a quoted Go
// string followed by a comma and a newline. The label of n itself is not
// written. The returned error is always nil.
func printNodeLabel(w io.Writer, n *node) error {
	for i := range n.children {
		fmt.Fprintf(w, "%q,\n", n.children[i].label)
	}
	return nil
}
// makeText builds the single string in which every label of labelsList can be
// found as a substring.
//
// Without the crush flag the labels are simply concatenated. With it, the
// result is shortened in two steps: labels contained in another label are
// dropped, and then the two strings with the longest suffix/prefix overlap
// are repeatedly merged ("arpa" and "parliament" become "arparliament") until
// no overlap is left. What remains is concatenated.
//
// With the v flag, progress is reported on stderr.
func makeText() string {
	if !*crush {
		return strings.Join(labelsList, "")
	}

	inputBytes := 0
	for i := range labelsList {
		inputBytes += len(labelsList[i])
	}

	// Work on a private copy so that labelsList itself is left alone.
	pool := make([]string, len(labelsList))
	copy(pool, labelsList)

	// Blank out every string that occurs inside another one, repeating
	// until a full pass changes nothing.
	for again := true; again; {
		again = false
		for i := range pool {
			if pool[i] == "" {
				continue
			}
			for j := range pool {
				if j == i || pool[j] == "" || !strings.Contains(pool[i], pool[j]) {
					continue
				}
				pool[j] = ""
				again = true
			}
		}
	}

	// After sorting, the blanked entries are all at the front; skip them.
	sort.Strings(pool)
	firstKept := 0
	for firstKept < len(pool) && pool[firstKept] == "" {
		firstKept++
	}
	pool = pool[firstKept:]

	// Greedily merge the pair with the longest overlap, where the tail of
	// pool[from] equals the head of pool[to]. Only a strictly longer overlap
	// replaces the current best, so ties go to the pair found first.
	for {
		from, to, overlap := -1, -1, 0
		for i, head := range pool {
			if head == "" {
				continue
			}
			for j, tail := range pool {
				if j == i {
					continue
				}
				limit := len(head)
				if len(tail) < limit {
					limit = len(tail)
				}
				for k := overlap + 1; k <= limit; k++ {
					if strings.HasSuffix(head, tail[:k]) {
						from, to, overlap = i, j, k
					}
				}
			}
		}
		if overlap == 0 {
			break
		}
		if *v {
			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
				overlap, from, to, len(pool), len(pool), pool[from], pool[to])
		}
		pool[from] += pool[to][overlap:]
		pool[to] = ""
	}

	text := strings.Join(pool, "")
	if *v {
		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", inputBytes, len(text))
	}
	return text
}