| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "io" |
| "os" |
| ) |
| |
// A NodeType identifies the kind of a Node.
type NodeType int

// The node kinds. They are numbered consecutively from zero in declaration
// order, so the zero value of NodeType is ErrorNode.
const (
	ErrorNode = NodeType(iota)
	TextNode
	DocumentNode
	ElementNode
	CommentNode
)
| |
| // A Node consists of a NodeType and some Data (tag name for element nodes, |
| // content for text) and are part of a tree of Nodes. Element nodes may also |
| // contain a slice of Attributes. Data is unescaped, so that it looks like |
| // "a<b" rather than "a<b". |
| type Node struct { |
| Parent *Node |
| Child []*Node |
| Type NodeType |
| Data string |
| Attr []Attribute |
| } |
| |
| // A parser implements the HTML5 parsing algorithm: |
| // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction |
| type parser struct { |
| // tokenizer provides the tokens for the parser. |
| tokenizer *Tokenizer |
| // tok is the most recently read token. |
| tok Token |
| // Self-closing tags like <hr/> are re-interpreted as a two-token sequence: |
| // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read |
| // the synthetic start tag and the next one due is the matching end tag. |
| hasSelfClosingToken bool |
| // doc is the document root element. |
| doc *Node |
| // The stack of open elements (section 10.2.3.2). |
| stack []*Node |
| // Element pointers (section 10.2.3.4). |
| head, form *Node |
| // Other parsing state flags (section 10.2.3.5). |
| scripting, framesetOK bool |
| } |
| |
| // pop pops the top of the stack of open elements. |
| // It will panic if the stack is empty. |
| func (p *parser) pop() *Node { |
| n := len(p.stack) |
| ret := p.stack[n-1] |
| p.stack = p.stack[:n-1] |
| return ret |
| } |
| |
| // push pushes onto the stack of open elements. |
| func (p *parser) push(n *Node) { |
| p.stack = append(p.stack, n) |
| } |
| |
| // top returns the top of the stack of open elements. |
| // This is also known as the current node. |
| func (p *parser) top() *Node { |
| if n := len(p.stack); n > 0 { |
| return p.stack[n-1] |
| } |
| return p.doc |
| } |
| |
| // addChild adds a child node n to the top element, and pushes n |
| // if it is an element node (text nodes do not have children). |
| func (p *parser) addChild(n *Node) { |
| m := p.top() |
| m.Child = append(m.Child, n) |
| if n.Type == ElementNode { |
| p.push(n) |
| } |
| } |
| |
| // addText adds text to the current node. |
| func (p *parser) addText(s string) { |
| // TODO(nigeltao): merge s with previous text, if the preceding node is a text node. |
| // TODO(nigeltao): distinguish whitespace text from others. |
| p.addChild(&Node{ |
| Type: TextNode, |
| Data: s, |
| }) |
| } |
| |
| // Section 10.2.3.3. |
| func (p *parser) addFormattingElement(n *Node) { |
| p.addChild(n) |
| // TODO. |
| } |
| |
| // Section 10.2.3.3. |
| func (p *parser) reconstructActiveFormattingElements() { |
| // TODO. |
| } |
| |
| // read reads the next token. This is usually from the tokenizer, but it may |
| // be the synthesized end tag implied by a self-closing tag. |
| func (p *parser) read() os.Error { |
| if p.hasSelfClosingToken { |
| p.hasSelfClosingToken = false |
| p.tok.Type = EndTagToken |
| p.tok.Attr = nil |
| return nil |
| } |
| p.tokenizer.Next() |
| p.tok = p.tokenizer.Token() |
| switch p.tok.Type { |
| case ErrorToken: |
| return p.tokenizer.Error() |
| case SelfClosingTagToken: |
| p.hasSelfClosingToken = true |
| p.tok.Type = StartTagToken |
| } |
| return nil |
| } |
| |
| // Section 10.2.4. |
| func (p *parser) acknowledgeSelfClosingTag() { |
| p.hasSelfClosingToken = false |
| } |
| |
| // An insertion mode (section 10.2.3.1) is the state transition function from |
| // a particular state in the HTML5 parser's state machine. It updates the |
| // parser's fields depending on parser.token (where ErrorToken means EOF). In |
| // addition to returning the next insertionMode state, it also returns whether |
| // the token was consumed. |
| type insertionMode func(*parser) (insertionMode, bool) |
| |
| // Section 10.2.5.4. |
| func initialInsertionMode(p *parser) (insertionMode, bool) { |
| // TODO(nigeltao): check p.tok for DOCTYPE. |
| return beforeHTMLInsertionMode, false |
| } |
| |
| // Section 10.2.5.5. |
| func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { |
| var ( |
| add bool |
| attr []Attribute |
| implied bool |
| ) |
| switch p.tok.Type { |
| case ErrorToken: |
| implied = true |
| case TextToken: |
| // TODO(nigeltao): distinguish whitespace text from others. |
| implied = true |
| case StartTagToken: |
| if p.tok.Data == "html" { |
| add = true |
| attr = p.tok.Attr |
| } else { |
| implied = true |
| } |
| case EndTagToken: |
| switch p.tok.Data { |
| case "head", "body", "html", "br": |
| implied = true |
| default: |
| // Ignore the token. |
| } |
| } |
| if add || implied { |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: "html", |
| Attr: attr, |
| }) |
| } |
| return beforeHeadInsertionMode, !implied |
| } |
| |
| // Section 10.2.5.6. |
| func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { |
| var ( |
| add bool |
| attr []Attribute |
| implied bool |
| ) |
| switch p.tok.Type { |
| case ErrorToken: |
| implied = true |
| case TextToken: |
| // TODO(nigeltao): distinguish whitespace text from others. |
| implied = true |
| case StartTagToken: |
| switch p.tok.Data { |
| case "head": |
| add = true |
| attr = p.tok.Attr |
| case "html": |
| return inBodyInsertionMode, false |
| default: |
| implied = true |
| } |
| case EndTagToken: |
| switch p.tok.Data { |
| case "head", "body", "html", "br": |
| implied = true |
| default: |
| // Ignore the token. |
| } |
| } |
| if add || implied { |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: "head", |
| Attr: attr, |
| }) |
| } |
| return inHeadInsertionMode, !implied |
| } |
| |
| // Section 10.2.5.7. |
| func inHeadInsertionMode(p *parser) (insertionMode, bool) { |
| var ( |
| pop bool |
| implied bool |
| ) |
| switch p.tok.Type { |
| case ErrorToken, TextToken: |
| implied = true |
| case StartTagToken: |
| switch p.tok.Data { |
| case "meta": |
| // TODO. |
| case "script": |
| // TODO. |
| default: |
| implied = true |
| } |
| case EndTagToken: |
| if p.tok.Data == "head" { |
| pop = true |
| } |
| // TODO. |
| } |
| if pop || implied { |
| n := p.pop() |
| if n.Data != "head" { |
| panic("html: bad parser state") |
| } |
| return afterHeadInsertionMode, !implied |
| } |
| return inHeadInsertionMode, !implied |
| } |
| |
| // Section 10.2.5.9. |
| func afterHeadInsertionMode(p *parser) (insertionMode, bool) { |
| var ( |
| add bool |
| attr []Attribute |
| framesetOK bool |
| implied bool |
| ) |
| switch p.tok.Type { |
| case ErrorToken, TextToken: |
| implied = true |
| framesetOK = true |
| case StartTagToken: |
| switch p.tok.Data { |
| case "html": |
| // TODO. |
| case "body": |
| add = true |
| attr = p.tok.Attr |
| framesetOK = false |
| case "frameset": |
| // TODO. |
| case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title": |
| // TODO. |
| case "head": |
| // TODO. |
| default: |
| implied = true |
| framesetOK = true |
| } |
| case EndTagToken: |
| // TODO. |
| } |
| if add || implied { |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: "body", |
| Attr: attr, |
| }) |
| p.framesetOK = framesetOK |
| } |
| return inBodyInsertionMode, !implied |
| } |
| |
| // Section 10.2.5.10. |
| func inBodyInsertionMode(p *parser) (insertionMode, bool) { |
| var endP bool |
| switch p.tok.Type { |
| case ErrorToken: |
| // No-op. |
| case TextToken: |
| p.addText(p.tok.Data) |
| p.framesetOK = false |
| case StartTagToken: |
| switch p.tok.Data { |
| case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul": |
| // TODO(nigeltao): Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2. |
| n := p.top() |
| if n.Type == ElementNode && n.Data == "p" { |
| endP = true |
| } else { |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: p.tok.Data, |
| Attr: p.tok.Attr, |
| }) |
| } |
| case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u": |
| p.reconstructActiveFormattingElements() |
| p.addFormattingElement(&Node{ |
| Type: ElementNode, |
| Data: p.tok.Data, |
| Attr: p.tok.Attr, |
| }) |
| case "area", "br", "embed", "img", "input", "keygen", "wbr": |
| p.reconstructActiveFormattingElements() |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: p.tok.Data, |
| Attr: p.tok.Attr, |
| }) |
| p.pop() |
| p.acknowledgeSelfClosingTag() |
| p.framesetOK = false |
| case "hr": |
| // TODO(nigeltao): auto-insert </p> if necessary. |
| p.addChild(&Node{ |
| Type: ElementNode, |
| Data: p.tok.Data, |
| Attr: p.tok.Attr, |
| }) |
| p.pop() |
| p.acknowledgeSelfClosingTag() |
| p.framesetOK = false |
| default: |
| // TODO. |
| } |
| case EndTagToken: |
| switch p.tok.Data { |
| case "body": |
| // TODO(nigeltao): autoclose the stack of open elements. |
| return afterBodyInsertionMode, true |
| case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u": |
| // TODO(nigeltao): implement the "adoption agency" algorithm: |
| // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency |
| p.pop() |
| default: |
| // TODO. |
| } |
| } |
| if endP { |
| // TODO(nigeltao): do the proper algorithm. |
| n := p.pop() |
| if n.Type != ElementNode || n.Data != "p" { |
| panic("unreachable") |
| } |
| } |
| return inBodyInsertionMode, !endP |
| } |
| |
| // Section 10.2.5.22. |
| func afterBodyInsertionMode(p *parser) (insertionMode, bool) { |
| switch p.tok.Type { |
| case ErrorToken: |
| // TODO. |
| case TextToken: |
| // TODO. |
| case StartTagToken: |
| // TODO. |
| case EndTagToken: |
| switch p.tok.Data { |
| case "html": |
| // TODO(nigeltao): autoclose the stack of open elements. |
| return afterAfterBodyInsertionMode, true |
| default: |
| // TODO. |
| } |
| } |
| return afterBodyInsertionMode, true |
| } |
| |
| // Section 10.2.5.25. |
| func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) { |
| return inBodyInsertionMode, false |
| } |
| |
| // Parse returns the parse tree for the HTML from the given Reader. |
| // The input is assumed to be UTF-8 encoded. |
| func Parse(r io.Reader) (*Node, os.Error) { |
| p := &parser{ |
| tokenizer: NewTokenizer(r), |
| doc: &Node{ |
| Type: DocumentNode, |
| }, |
| scripting: true, |
| framesetOK: true, |
| } |
| // Iterate until EOF. Any other error will cause an early return. |
| im, consumed := initialInsertionMode, true |
| for { |
| if consumed { |
| if err := p.read(); err != nil { |
| if err == os.EOF { |
| break |
| } |
| return nil, err |
| } |
| } |
| im, consumed = im(p) |
| } |
| // Loop until the final token (the ErrorToken signifying EOF) is consumed. |
| for { |
| if im, consumed = im(p); consumed { |
| break |
| } |
| } |
| return p.doc, nil |
| } |