html: improve parsing of tables
When foster parenting, merge adjacent text nodes.
Properly close table row at </tr> tag.
Pass tests1.dat, test 32:
<!-----><font><div>hello<table>excite!<b>me!<th><i>please!</tr><!--X-->
| <!-- - -->
| <html>
| <head>
| <body>
| <font>
| <div>
| "helloexcite!"
| <b>
| "me!"
| <table>
| <tbody>
| <tr>
| <th>
| <i>
| "please!"
| <!-- X -->
R=nigeltao
CC=golang-dev
https://golang.org/cl/5323048
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
index d1d4e48..292fbaf 100644
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -52,6 +52,11 @@
tableScopeStopTags = []string{"html", "table"}
)
+// stopTags for use in clearStackToContext.
+var (
+ tableRowContextStopTags = []string{"tr", "html"}
+)
+
// popUntil pops the stack of open elements at the highest element whose tag
// is in matchTags, provided there is no higher element in stopTags. It returns
// whether or not there was such an element. If there was not, popUntil leaves
@@ -146,6 +151,11 @@
}
}
+ if i > 0 && parent.Child[i-1].Type == TextNode && n.Type == TextNode {
+ parent.Child[i-1].Data += n.Data
+ return
+ }
+
if i == len(parent.Child) {
parent.Add(n)
} else {
@@ -749,11 +759,11 @@
case StartTagToken:
switch p.tok.Data {
case "tbody", "tfoot", "thead":
- p.clearStackToTableContext()
+ p.clearStackToContext(tableScopeStopTags)
p.addElement(p.tok.Data, p.tok.Attr)
return inTableBodyIM, true
case "td", "th", "tr":
- p.clearStackToTableContext()
+ p.clearStackToContext(tableScopeStopTags)
p.addElement("tbody", nil)
return inTableBodyIM, false
case "table":
@@ -794,11 +804,15 @@
return useTheRulesFor(p, inTableIM, inBodyIM)
}
-func (p *parser) clearStackToTableContext() {
+// clearStackToContext pops elements off the stack of open elements
+// until an element listed in stopTags is found.
+func (p *parser) clearStackToContext(stopTags []string) {
for i := len(p.oe) - 1; i >= 0; i-- {
- if x := p.oe[i].Data; x == "table" || x == "html" {
- p.oe = p.oe[:i+1]
- return
+ for _, tag := range stopTags {
+ if p.oe[i].Data == tag {
+ p.oe = p.oe[:i+1]
+ return
+ }
}
}
}
@@ -877,7 +891,12 @@
case EndTagToken:
switch p.tok.Data {
case "tr":
- // TODO.
+ if !p.elementInScope(tableScopeStopTags, "tr") {
+ return inRowIM, true
+ }
+ p.clearStackToContext(tableRowContextStopTags)
+ p.oe.pop()
+ return inTableBodyIM, true
case "table":
if p.popUntil(tableScopeStopTags, "tr") {
return inTableBodyIM, false
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
index beba98d..865a47d 100644
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@@ -132,7 +132,7 @@
rc := make(chan io.Reader)
go readDat(filename, rc)
// TODO(nigeltao): Process all test cases, not just a subset.
- for i := 0; i < 32; i++ {
+ for i := 0; i < 33; i++ {
// Parse the #data section.
b, err := ioutil.ReadAll(<-rc)
if err != nil {