package htmlhelp import ( "fmt" "io" "strings" "golang.org/x/net/html" ) func RecursiveTree(output io.Writer, doc *html.Node, level int) { var path *html.Node var spaces string = strings.Repeat(" ", level*2) for path = doc.FirstChild; path != nil; path = path.NextSibling { switch path.Type { case html.ElementNode: output.Write([]byte(fmt.Sprintf("%s%s\n", spaces, path.Data))) case html.TextNode: output.Write([]byte(fmt.Sprintf("%sTEXT: %#v\n", spaces, path.Data))) case html.DoctypeNode: output.Write([]byte(fmt.Sprintf("%sDoctype: %s\n", spaces, path.Data))) default: output.Write([]byte(fmt.Sprintf("?? %#v\n", path))) } if path.FirstChild != nil { RecursiveTree(output, path, level+1) } } } func OutputTree(output io.Writer, doc *html.Node) { RecursiveTree(output, doc, 0) } // Find one matching Node func RecursiveFindOne(doc *html.Node, match func(doc *html.Node) bool) *html.Node { var path *html.Node for path = doc.FirstChild; path != nil; path = path.NextSibling { if match(path) { return path } var nested *html.Node = RecursiveFindOne(path, match) if nested != nil { return nested } } return nil } // Find all matching Nodes func RecursiveFindAll(doc *html.Node, match func(doc *html.Node) bool) []*html.Node { var path *html.Node var results []*html.Node for path = doc.FirstChild; path != nil; path = path.NextSibling { if match(path) { results = append(results, path) } var nested []*html.Node = RecursiveFindAll(path, match) if len(nested) != 0 { results = append(results, nested...) 
} } return results } func GetAttr(node *html.Node, attrKey string) (string, bool) { var attr html.Attribute for _, attr = range node.Attr { if attr.Key == attrKey { return attr.Val, true } } return "", false } func HasAttr(node *html.Node, attrKey string, attrValue string) bool { var attr html.Attribute for _, attr = range node.Attr { if attr.Key == attrKey { var val string for _, val = range strings.Split(attr.Val, " ") { if val == attrValue { return true } } } } return false } func MatchTag(tag string) func(*html.Node) bool { return func(node *html.Node) bool { return node.Type == html.ElementNode && node.Data == tag } } func MatchNestedTags(tags ...string) func(*html.Node) bool { // reverse the order of the tags for left, right := 0, len(tags)-1; left < right; left, right = left+1, right-1 { tags[left], tags[right] = tags[right], tags[left] } return func(node *html.Node) bool { var rev string var walk *html.Node = node for _, rev = range tags { if walk.Type == html.ElementNode && walk.Data == rev { // This never happens. Every html.ElementNode has a parent. // html has parent of DocumentNode. // This got coverage by altering the document: See TestNilParent. 
if walk.Parent == nil { return false } walk = walk.Parent } else { return false } } return true } } func MatchTagAttr(tag string, attrKey string, attrValue string) func(*html.Node) bool { return func(node *html.Node) bool { if node.Type == html.ElementNode && node.Data == tag { if HasAttr(node, attrKey, attrValue) { return true } } return false } } func FindTagsAttr(doc *html.Node, tag string, attrKey string, attrValue string) []*html.Node { var tagMatch func(*html.Node) bool tagMatch = func(node *html.Node) bool { if node.Type == html.ElementNode && node.Data == tag { if HasAttr(node, attrKey, attrValue) { return true } } return false } return RecursiveFindAll(doc, tagMatch) } func FindTags(doc *html.Node, tag string) []*html.Node { var tagMatch func(*html.Node) bool tagMatch = func(node *html.Node) bool { return node.Type == html.ElementNode && node.Data == tag } return RecursiveFindAll(doc, tagMatch) } // GetText: can't tell if the text is part of the element, or follows // the element. This only gets the first child text node. // So something like:
//	<div><p>Missing</p></div>
for the div would be "". func GetText(node *html.Node) string { node = node.FirstChild if node != nil { if node.Type == html.TextNode { return node.Data } } return "" } // GetAllText: collects all the text nodes within the given node. //
//	<div><p>Text Within</p></div>
would return "Text Within" func GetAllText(node *html.Node) string { var results string if node.Type == html.TextNode { results += node.Data } for node = node.FirstChild; node != nil; node = node.NextSibling { results += GetAllText(node) } return results } func GetAllTextBR(node *html.Node) string { var results string if node.Type == html.TextNode { results += node.Data } for node = node.FirstChild; node != nil; node = node.NextSibling { if node.Type == html.ElementNode && node.Data == "br" { results += "\n" } else { results += GetAllTextBR(node) } } return results }