|
@@ -0,0 +1,201 @@
|
|
|
+package git.red-green.com/david/html-help
|
|
|
+
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "io"
|
|
|
+ "strings"
|
|
|
+
|
|
|
+ "golang.org/x/net/html"
|
|
|
+)
|
|
|
+
|
|
|
+func RecursiveTree(output io.Writer, doc *html.Node, level int) {
|
|
|
+ var path *html.Node
|
|
|
+ var spaces string = strings.Repeat(" ", level*2)
|
|
|
+
|
|
|
+ for path = doc.FirstChild; path != nil; path = path.NextSibling {
|
|
|
+ switch path.Type {
|
|
|
+ case html.ElementNode:
|
|
|
+ output.Write([]byte(fmt.Sprintf("%s%s\n", spaces, path.Data)))
|
|
|
+ case html.TextNode:
|
|
|
+ output.Write([]byte(fmt.Sprintf("%sTEXT: %#v\n", spaces, path.Data)))
|
|
|
+ case html.DoctypeNode:
|
|
|
+ output.Write([]byte(fmt.Sprintf("%sDoctype: %s\n", spaces, path.Data)))
|
|
|
+ default:
|
|
|
+ output.Write([]byte(fmt.Sprintf("?? %#v\n", path)))
|
|
|
+ }
|
|
|
+
|
|
|
+ if path.FirstChild != nil {
|
|
|
+ RecursiveTree(output, path, level+1)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func OutputTree(output io.Writer, doc *html.Node) {
|
|
|
+ RecursiveTree(output, doc, 0)
|
|
|
+}
|
|
|
+
|
|
|
+// Find one matching Node
|
|
|
+func RecursiveFindOne(doc *html.Node, match func(doc *html.Node) bool) *html.Node {
|
|
|
+ var path *html.Node
|
|
|
+
|
|
|
+ for path = doc.FirstChild; path != nil; path = path.NextSibling {
|
|
|
+ if match(path) {
|
|
|
+ return path
|
|
|
+ }
|
|
|
+ var nested *html.Node = RecursiveFindOne(path, match)
|
|
|
+ if nested != nil {
|
|
|
+ return nested
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+// Find all matching Nodes
|
|
|
+func RecursiveFindAll(doc *html.Node, match func(doc *html.Node) bool) []*html.Node {
|
|
|
+ var path *html.Node
|
|
|
+ var results []*html.Node
|
|
|
+
|
|
|
+ for path = doc.FirstChild; path != nil; path = path.NextSibling {
|
|
|
+ if match(path) {
|
|
|
+ results = append(results, path)
|
|
|
+ }
|
|
|
+ var nested []*html.Node = RecursiveFindAll(path, match)
|
|
|
+ if len(nested) != 0 {
|
|
|
+ results = append(results, nested...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return results
|
|
|
+}
|
|
|
+
|
|
|
+func GetAttr(node *html.Node, attrKey string) (string, bool) {
|
|
|
+ var attr html.Attribute
|
|
|
+
|
|
|
+ for _, attr = range node.Attr {
|
|
|
+ if attr.Key == attrKey {
|
|
|
+ return attr.Val, true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return "", false
|
|
|
+}
|
|
|
+
|
|
|
+func HasAttr(node *html.Node, attrKey string, attrValue string) bool {
|
|
|
+ var attr html.Attribute
|
|
|
+
|
|
|
+ for _, attr = range node.Attr {
|
|
|
+ if attr.Key == attrKey {
|
|
|
+ var val string
|
|
|
+ for _, val = range strings.Split(attr.Val, " ") {
|
|
|
+ if val == attrValue {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
+func MatchTag(tag string) func(*html.Node) bool {
|
|
|
+ return func(node *html.Node) bool {
|
|
|
+ return node.Type == html.ElementNode && node.Data == tag
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func MatchNestedTags(tags ...string) func(*html.Node) bool {
|
|
|
+ // reverse the order of the tags
|
|
|
+ for left, right := 0, len(tags)-1; left < right; left, right = left+1, right-1 {
|
|
|
+ tags[left], tags[right] = tags[right], tags[left]
|
|
|
+ }
|
|
|
+
|
|
|
+ return func(node *html.Node) bool {
|
|
|
+ var rev string
|
|
|
+ var walk *html.Node = node
|
|
|
+ for _, rev = range tags {
|
|
|
+ if walk.Type == html.ElementNode && walk.Data == rev {
|
|
|
+ // This never happens. Every html.ElementNode has a parent.
|
|
|
+ // html has parent of DocumentNode.
|
|
|
+ // This got coverage by altering the document: See TestNilParent.
|
|
|
+ if walk.Parent == nil {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ walk = walk.Parent
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return true
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func MatchTagAttr(tag string, attrKey string, attrValue string) func(*html.Node) bool {
|
|
|
+ return func(node *html.Node) bool {
|
|
|
+ if node.Type == html.ElementNode && node.Data == tag {
|
|
|
+ if HasAttr(node, attrKey, attrValue) {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func FindTagsAttr(doc *html.Node, tag string, attrKey string, attrValue string) []*html.Node {
|
|
|
+ var tagMatch func(*html.Node) bool
|
|
|
+ tagMatch = func(node *html.Node) bool {
|
|
|
+ if node.Type == html.ElementNode && node.Data == tag {
|
|
|
+ if HasAttr(node, attrKey, attrValue) {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ return RecursiveFindAll(doc, tagMatch)
|
|
|
+}
|
|
|
+
|
|
|
+func FindTags(doc *html.Node, tag string) []*html.Node {
|
|
|
+ var tagMatch func(*html.Node) bool
|
|
|
+ tagMatch = func(node *html.Node) bool {
|
|
|
+ return node.Type == html.ElementNode && node.Data == tag
|
|
|
+ }
|
|
|
+ return RecursiveFindAll(doc, tagMatch)
|
|
|
+}
|
|
|
+
|
|
|
+// GetText: can't tell if the text is part of the element, or follows
|
|
|
+// the element. This only gets the first child text node.
|
|
|
+// So something like: <div><b>Missing</b></div> for the div would be "".
|
|
|
+func GetText(node *html.Node) string {
|
|
|
+ node = node.FirstChild
|
|
|
+ if node != nil {
|
|
|
+ if node.Type == html.TextNode {
|
|
|
+ return node.Data
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ""
|
|
|
+}
|
|
|
+
|
|
|
+// GetAllText: collects all the text nodes within the given node.
|
|
|
+// <div><b>Text</b> Within</div> would return "Text Within"
|
|
|
+func GetAllText(node *html.Node) string {
|
|
|
+ var results string
|
|
|
+ if node.Type == html.TextNode {
|
|
|
+ results += node.Data
|
|
|
+ }
|
|
|
+ for node = node.FirstChild; node != nil; node = node.NextSibling {
|
|
|
+ results += GetAllText(node)
|
|
|
+ }
|
|
|
+ return results
|
|
|
+}
|
|
|
+
|
|
|
+func GetAllTextBR(node *html.Node) string {
|
|
|
+ var results string
|
|
|
+
|
|
|
+ if node.Type == html.TextNode {
|
|
|
+ results += node.Data
|
|
|
+ }
|
|
|
+ for node = node.FirstChild; node != nil; node = node.NextSibling {
|
|
|
+ if node.Type == html.ElementNode && node.Data == "br" {
|
|
|
+ results += "\n"
|
|
|
+ } else {
|
|
|
+ results += GetAllTextBR(node)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return results
|
|
|
+}
|