| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // This program takes an HTML file and outputs a corresponding article file in |
| // present format. See: golang.org/x/tools/present |
| package main // import "golang.org/x/tools/cmd/html2article" |
| |
| import ( |
| "bytes" |
| "errors" |
| "flag" |
| "fmt" |
| "io" |
| "log" |
| "net/url" |
| "os" |
| "regexp" |
| "strings" |
| |
| "golang.org/x/net/html" |
| "golang.org/x/net/html/atom" |
| ) |
| |
| func main() { |
| flag.Parse() |
| |
| err := convert(os.Stdout, os.Stdin) |
| if err != nil { |
| log.Fatal(err) |
| } |
| } |
| |
| func convert(w io.Writer, r io.Reader) error { |
| root, err := html.Parse(r) |
| if err != nil { |
| return err |
| } |
| |
| style := find(root, isTag(atom.Style)) |
| if err := parseStyles(style); err != nil { |
| log.Printf("couldn't parse all styles: %v", err) |
| } |
| |
| body := find(root, isTag(atom.Body)) |
| if body == nil { |
| return errors.New("couldn't find body") |
| } |
| article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body)))) |
| _, err = fmt.Fprintf(w, "Title\n\n%s", article) |
| return err |
| } |
| |
// Style identifies the kind of inline emphasis a CSS rule implies. Its string
// value is the marker character used for that emphasis in the output.
type Style string

// The emphasis styles recognised by parseStyles.
const (
	Bold   = Style("*")
	Italic = Style("_")
	Code   = Style("`")
)

// cssRules maps CSS selector text to the Style implied by that selector's
// rule. It is filled in by parseStyles and consulted by hasStyle.
var cssRules = map[string]Style{}
| |
| func parseStyles(style *html.Node) error { |
| if style == nil || style.FirstChild == nil { |
| return errors.New("couldn't find styles") |
| } |
| |
| styles := style.FirstChild.Data |
| readUntil := func(end rune) (string, bool) { |
| i := strings.IndexRune(styles, end) |
| if i < 0 { |
| return "", false |
| } |
| s := styles[:i] |
| styles = styles[i:] |
| return s, true |
| } |
| |
| for { |
| sel, ok := readUntil('{') |
| if !ok && sel == "" { |
| break |
| } else if !ok { |
| return fmt.Errorf("could not parse selector %q", styles) |
| } |
| |
| value, ok := readUntil('}') |
| if !ok { |
| return fmt.Errorf("couldn't parse style body for %s", sel) |
| } |
| switch { |
| case strings.Contains(value, "italic"): |
| cssRules[sel] = Italic |
| case strings.Contains(value, "bold"): |
| cssRules[sel] = Bold |
| case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"): |
| cssRules[sel] = Code |
| } |
| } |
| return nil |
| } |
| |
// newlineRun matches any run of two or more consecutive newlines.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns collapses every run of two or more newlines in s into
// exactly two, so that the result contains no more than one blank line in a
// row.
func limitNewlineRuns(s string) string {
	const blankLine = "\n\n"
	return newlineRun.ReplaceAllLiteralString(s, blankLine)
}
| |
| func makeHeadings(body string) string { |
| buf := new(bytes.Buffer) |
| lines := strings.Split(body, "\n") |
| for i, s := range lines { |
| if i == 0 && !isBoldTitle(s) { |
| buf.WriteString("* Introduction\n\n") |
| } |
| if isBoldTitle(s) { |
| s = strings.TrimSpace(strings.Replace(s, "*", " ", -1)) |
| s = "* " + s |
| } |
| buf.WriteString(s) |
| buf.WriteByte('\n') |
| } |
| return buf.String() |
| } |
| |
// isBoldTitle reports whether s contains no space character and both starts
// and ends with an asterisk.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	return strings.HasPrefix(s, "*") && strings.HasSuffix(s, "*")
}
| |
// indent appends s to buf line by line, prefixing each non-empty line with a
// tab. Every line, empty or not, is terminated by a newline, so an s that
// ends in a newline yields one extra empty line.
func indent(buf *bytes.Buffer, s string) {
	lines := strings.Split(s, "\n")
	for i := range lines {
		if line := lines[i]; len(line) > 0 {
			buf.WriteString("\t" + line)
		}
		buf.WriteByte('\n')
	}
}
| |
// unwrap appends s to buf with hard line wraps removed. Each line is trimmed
// of surrounding whitespace; consecutive non-blank lines are joined by a
// single space, and the first blank line after a non-blank one becomes a
// paragraph break ("\n\n"). Blank lines that do not follow text produce no
// output.
func unwrap(buf *bytes.Buffer, s string) {
	inParagraph := false
	for _, line := range strings.Split(s, "\n") {
		line = strings.TrimSpace(line)
		switch {
		case line != "" && inParagraph:
			// Continuation of the current paragraph.
			buf.WriteByte(' ')
			buf.WriteString(line)
		case line != "":
			// First line of a new paragraph.
			buf.WriteString(line)
			inParagraph = true
		case inParagraph:
			// Blank line ending the current paragraph.
			buf.WriteString("\n\n")
			inParagraph = false
		}
	}
}
| |
| func text(n *html.Node) string { |
| var buf bytes.Buffer |
| walk(n, func(n *html.Node) bool { |
| switch n.Type { |
| case html.TextNode: |
| buf.WriteString(n.Data) |
| return false |
| case html.ElementNode: |
| // no-op |
| default: |
| return true |
| } |
| a := n.DataAtom |
| if a == atom.Span { |
| switch { |
| case hasStyle(Code)(n): |
| a = atom.Code |
| case hasStyle(Bold)(n): |
| a = atom.B |
| case hasStyle(Italic)(n): |
| a = atom.I |
| } |
| } |
| switch a { |
| case atom.Br: |
| buf.WriteByte('\n') |
| case atom.P: |
| unwrap(&buf, childText(n)) |
| buf.WriteString("\n\n") |
| case atom.Li: |
| buf.WriteString("- ") |
| unwrap(&buf, childText(n)) |
| buf.WriteByte('\n') |
| case atom.Pre: |
| indent(&buf, childText(n)) |
| buf.WriteByte('\n') |
| case atom.A: |
| href, text := attr(n, "href"), childText(n) |
| // Skip links with no text. |
| if strings.TrimSpace(text) == "" { |
| break |
| } |
| // Don't emit empty links. |
| if strings.TrimSpace(href) == "" { |
| buf.WriteString(text) |
| break |
| } |
| // Use original url for Google Docs redirections. |
| if u, err := url.Parse(href); err != nil { |
| log.Printf("parsing url %q: %v", href, err) |
| } else if u.Host == "www.google.com" && u.Path == "/url" { |
| href = u.Query().Get("q") |
| } |
| fmt.Fprintf(&buf, "[[%s][%s]]", href, text) |
| case atom.Code: |
| buf.WriteString(highlight(n, "`")) |
| case atom.B: |
| buf.WriteString(highlight(n, "*")) |
| case atom.I: |
| buf.WriteString(highlight(n, "_")) |
| case atom.Img: |
| src := attr(n, "src") |
| fmt.Fprintf(&buf, ".image %s\n", src) |
| case atom.Iframe: |
| src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height") |
| fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w) |
| case atom.Param: |
| if attr(n, "name") == "movie" { |
| // Old style YouTube embed. |
| u := attr(n, "value") |
| u = strings.Replace(u, "/v/", "/embed/", 1) |
| if i := strings.Index(u, "&"); i >= 0 { |
| u = u[:i] |
| } |
| fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) |
| } |
| case atom.Title: |
| default: |
| return true |
| } |
| return false |
| }) |
| return buf.String() |
| } |
| |
| func childText(node *html.Node) string { |
| var buf bytes.Buffer |
| for n := node.FirstChild; n != nil; n = n.NextSibling { |
| fmt.Fprint(&buf, text(n)) |
| } |
| return buf.String() |
| } |
| |
| func highlight(node *html.Node, char string) string { |
| t := strings.Replace(childText(node), " ", char, -1) |
| return fmt.Sprintf("%s%s%s", char, t, char) |
| } |
| |
| type selector func(*html.Node) bool |
| |
| func isTag(a atom.Atom) selector { |
| return func(n *html.Node) bool { |
| return n.DataAtom == a |
| } |
| } |
| |
| func hasClass(name string) selector { |
| return func(n *html.Node) bool { |
| for _, a := range n.Attr { |
| if a.Key == "class" { |
| for _, c := range strings.Fields(a.Val) { |
| if c == name { |
| return true |
| } |
| } |
| } |
| } |
| return false |
| } |
| } |
| |
| func hasStyle(s Style) selector { |
| return func(n *html.Node) bool { |
| for rule, s2 := range cssRules { |
| if s2 != s { |
| continue |
| } |
| if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) { |
| return true |
| } |
| if n.DataAtom.String() == rule { |
| return true |
| } |
| } |
| return false |
| } |
| } |
| |
| func attr(node *html.Node, key string) (value string) { |
| for _, attr := range node.Attr { |
| if attr.Key == key { |
| return attr.Val |
| } |
| } |
| return "" |
| } |
| |
| func find(n *html.Node, fn selector) *html.Node { |
| var result *html.Node |
| walk(n, func(n *html.Node) bool { |
| if result != nil { |
| return false |
| } |
| if fn(n) { |
| result = n |
| return false |
| } |
| return true |
| }) |
| return result |
| } |
| |
| func walk(n *html.Node, fn selector) { |
| if fn(n) { |
| for c := n.FirstChild; c != nil; c = c.NextSibling { |
| walk(c, fn) |
| } |
| } |
| } |