123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201 |
- package htmlhelp
- import (
- "fmt"
- "io"
- "strings"
- "golang.org/x/net/html"
- )
- func RecursiveTree(output io.Writer, doc *html.Node, level int) {
- var path *html.Node
- var spaces string = strings.Repeat(" ", level*2)
- for path = doc.FirstChild; path != nil; path = path.NextSibling {
- switch path.Type {
- case html.ElementNode:
- output.Write([]byte(fmt.Sprintf("%s%s\n", spaces, path.Data)))
- case html.TextNode:
- output.Write([]byte(fmt.Sprintf("%sTEXT: %#v\n", spaces, path.Data)))
- case html.DoctypeNode:
- output.Write([]byte(fmt.Sprintf("%sDoctype: %s\n", spaces, path.Data)))
- default:
- output.Write([]byte(fmt.Sprintf("?? %#v\n", path)))
- }
- if path.FirstChild != nil {
- RecursiveTree(output, path, level+1)
- }
- }
- }
- func OutputTree(output io.Writer, doc *html.Node) {
- RecursiveTree(output, doc, 0)
- }
- // Find one matching Node
- func RecursiveFindOne(doc *html.Node, match func(doc *html.Node) bool) *html.Node {
- var path *html.Node
- for path = doc.FirstChild; path != nil; path = path.NextSibling {
- if match(path) {
- return path
- }
- var nested *html.Node = RecursiveFindOne(path, match)
- if nested != nil {
- return nested
- }
- }
- return nil
- }
- // Find all matching Nodes
- func RecursiveFindAll(doc *html.Node, match func(doc *html.Node) bool) []*html.Node {
- var path *html.Node
- var results []*html.Node
- for path = doc.FirstChild; path != nil; path = path.NextSibling {
- if match(path) {
- results = append(results, path)
- }
- var nested []*html.Node = RecursiveFindAll(path, match)
- if len(nested) != 0 {
- results = append(results, nested...)
- }
- }
- return results
- }
- func GetAttr(node *html.Node, attrKey string) (string, bool) {
- var attr html.Attribute
- for _, attr = range node.Attr {
- if attr.Key == attrKey {
- return attr.Val, true
- }
- }
- return "", false
- }
- func HasAttr(node *html.Node, attrKey string, attrValue string) bool {
- var attr html.Attribute
- for _, attr = range node.Attr {
- if attr.Key == attrKey {
- var val string
- for _, val = range strings.Split(attr.Val, " ") {
- if val == attrValue {
- return true
- }
- }
- }
- }
- return false
- }
- func MatchTag(tag string) func(*html.Node) bool {
- return func(node *html.Node) bool {
- return node.Type == html.ElementNode && node.Data == tag
- }
- }
- func MatchNestedTags(tags ...string) func(*html.Node) bool {
- // reverse the order of the tags
- for left, right := 0, len(tags)-1; left < right; left, right = left+1, right-1 {
- tags[left], tags[right] = tags[right], tags[left]
- }
- return func(node *html.Node) bool {
- var rev string
- var walk *html.Node = node
- for _, rev = range tags {
- if walk.Type == html.ElementNode && walk.Data == rev {
- // This never happens. Every html.ElementNode has a parent.
- // html has parent of DocumentNode.
- // This got coverage by altering the document: See TestNilParent.
- if walk.Parent == nil {
- return false
- }
- walk = walk.Parent
- } else {
- return false
- }
- }
- return true
- }
- }
- func MatchTagAttr(tag string, attrKey string, attrValue string) func(*html.Node) bool {
- return func(node *html.Node) bool {
- if node.Type == html.ElementNode && node.Data == tag {
- if HasAttr(node, attrKey, attrValue) {
- return true
- }
- }
- return false
- }
- }
- func FindTagsAttr(doc *html.Node, tag string, attrKey string, attrValue string) []*html.Node {
- var tagMatch func(*html.Node) bool
- tagMatch = func(node *html.Node) bool {
- if node.Type == html.ElementNode && node.Data == tag {
- if HasAttr(node, attrKey, attrValue) {
- return true
- }
- }
- return false
- }
- return RecursiveFindAll(doc, tagMatch)
- }
- func FindTags(doc *html.Node, tag string) []*html.Node {
- var tagMatch func(*html.Node) bool
- tagMatch = func(node *html.Node) bool {
- return node.Type == html.ElementNode && node.Data == tag
- }
- return RecursiveFindAll(doc, tagMatch)
- }
- // GetText: can't tell if the text is part of the element, or follows
- // the element. This only gets the first child text node.
- // So something like: <div><b>Missing</b></div> for the div would be "".
- func GetText(node *html.Node) string {
- node = node.FirstChild
- if node != nil {
- if node.Type == html.TextNode {
- return node.Data
- }
- }
- return ""
- }
- // GetAllText: collects all the text nodes within the given node.
- // <div><b>Text</b> Within</div> would return "Text Within"
- func GetAllText(node *html.Node) string {
- var results string
- if node.Type == html.TextNode {
- results += node.Data
- }
- for node = node.FirstChild; node != nil; node = node.NextSibling {
- results += GetAllText(node)
- }
- return results
- }
- func GetAllTextBR(node *html.Node) string {
- var results string
- if node.Type == html.TextNode {
- results += node.Data
- }
- for node = node.FirstChild; node != nil; node = node.NextSibling {
- if node.Type == html.ElementNode && node.Data == "br" {
- results += "\n"
- } else {
- results += GetAllTextBR(node)
- }
- }
- return results
- }
|