html-help.go 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. package htmlhelp
  2. import (
  3. "fmt"
  4. "io"
  5. "strings"
  6. "golang.org/x/net/html"
  7. )
  8. func RecursiveTree(output io.Writer, doc *html.Node, level int) {
  9. var path *html.Node
  10. var spaces string = strings.Repeat(" ", level*2)
  11. for path = doc.FirstChild; path != nil; path = path.NextSibling {
  12. switch path.Type {
  13. case html.ElementNode:
  14. output.Write([]byte(fmt.Sprintf("%s%s\n", spaces, path.Data)))
  15. case html.TextNode:
  16. output.Write([]byte(fmt.Sprintf("%sTEXT: %#v\n", spaces, path.Data)))
  17. case html.DoctypeNode:
  18. output.Write([]byte(fmt.Sprintf("%sDoctype: %s\n", spaces, path.Data)))
  19. default:
  20. output.Write([]byte(fmt.Sprintf("?? %#v\n", path)))
  21. }
  22. if path.FirstChild != nil {
  23. RecursiveTree(output, path, level+1)
  24. }
  25. }
  26. }
  27. func OutputTree(output io.Writer, doc *html.Node) {
  28. RecursiveTree(output, doc, 0)
  29. }
  30. // Find one matching Node
  31. func RecursiveFindOne(doc *html.Node, match func(doc *html.Node) bool) *html.Node {
  32. var path *html.Node
  33. for path = doc.FirstChild; path != nil; path = path.NextSibling {
  34. if match(path) {
  35. return path
  36. }
  37. var nested *html.Node = RecursiveFindOne(path, match)
  38. if nested != nil {
  39. return nested
  40. }
  41. }
  42. return nil
  43. }
  44. // Find all matching Nodes
  45. func RecursiveFindAll(doc *html.Node, match func(doc *html.Node) bool) []*html.Node {
  46. var path *html.Node
  47. var results []*html.Node
  48. for path = doc.FirstChild; path != nil; path = path.NextSibling {
  49. if match(path) {
  50. results = append(results, path)
  51. }
  52. var nested []*html.Node = RecursiveFindAll(path, match)
  53. if len(nested) != 0 {
  54. results = append(results, nested...)
  55. }
  56. }
  57. return results
  58. }
  59. func GetAttr(node *html.Node, attrKey string) (string, bool) {
  60. var attr html.Attribute
  61. for _, attr = range node.Attr {
  62. if attr.Key == attrKey {
  63. return attr.Val, true
  64. }
  65. }
  66. return "", false
  67. }
  68. func HasAttr(node *html.Node, attrKey string, attrValue string) bool {
  69. var attr html.Attribute
  70. for _, attr = range node.Attr {
  71. if attr.Key == attrKey {
  72. var val string
  73. for _, val = range strings.Split(attr.Val, " ") {
  74. if val == attrValue {
  75. return true
  76. }
  77. }
  78. }
  79. }
  80. return false
  81. }
  82. func MatchTag(tag string) func(*html.Node) bool {
  83. return func(node *html.Node) bool {
  84. return node.Type == html.ElementNode && node.Data == tag
  85. }
  86. }
  87. func MatchNestedTags(tags ...string) func(*html.Node) bool {
  88. // reverse the order of the tags
  89. for left, right := 0, len(tags)-1; left < right; left, right = left+1, right-1 {
  90. tags[left], tags[right] = tags[right], tags[left]
  91. }
  92. return func(node *html.Node) bool {
  93. var rev string
  94. var walk *html.Node = node
  95. for _, rev = range tags {
  96. if walk.Type == html.ElementNode && walk.Data == rev {
  97. // This never happens. Every html.ElementNode has a parent.
  98. // html has parent of DocumentNode.
  99. // This got coverage by altering the document: See TestNilParent.
  100. if walk.Parent == nil {
  101. return false
  102. }
  103. walk = walk.Parent
  104. } else {
  105. return false
  106. }
  107. }
  108. return true
  109. }
  110. }
  111. func MatchTagAttr(tag string, attrKey string, attrValue string) func(*html.Node) bool {
  112. return func(node *html.Node) bool {
  113. if node.Type == html.ElementNode && node.Data == tag {
  114. if HasAttr(node, attrKey, attrValue) {
  115. return true
  116. }
  117. }
  118. return false
  119. }
  120. }
  121. func FindTagsAttr(doc *html.Node, tag string, attrKey string, attrValue string) []*html.Node {
  122. var tagMatch func(*html.Node) bool
  123. tagMatch = func(node *html.Node) bool {
  124. if node.Type == html.ElementNode && node.Data == tag {
  125. if HasAttr(node, attrKey, attrValue) {
  126. return true
  127. }
  128. }
  129. return false
  130. }
  131. return RecursiveFindAll(doc, tagMatch)
  132. }
  133. func FindTags(doc *html.Node, tag string) []*html.Node {
  134. var tagMatch func(*html.Node) bool
  135. tagMatch = func(node *html.Node) bool {
  136. return node.Type == html.ElementNode && node.Data == tag
  137. }
  138. return RecursiveFindAll(doc, tagMatch)
  139. }
  140. // GetText: can't tell if the text is part of the element, or follows
  141. // the element. This only gets the first child text node.
  142. // So something like: <div><b>Missing</b></div> for the div would be "".
  143. func GetText(node *html.Node) string {
  144. node = node.FirstChild
  145. if node != nil {
  146. if node.Type == html.TextNode {
  147. return node.Data
  148. }
  149. }
  150. return ""
  151. }
  152. // GetAllText: collects all the text nodes within the given node.
  153. // <div><b>Text</b> Within</div> would return "Text Within"
  154. func GetAllText(node *html.Node) string {
  155. var results string
  156. if node.Type == html.TextNode {
  157. results += node.Data
  158. }
  159. for node = node.FirstChild; node != nil; node = node.NextSibling {
  160. results += GetAllText(node)
  161. }
  162. return results
  163. }
  164. func GetAllTextBR(node *html.Node) string {
  165. var results string
  166. if node.Type == html.TextNode {
  167. results += node.Data
  168. }
  169. for node = node.FirstChild; node != nil; node = node.NextSibling {
  170. if node.Type == html.ElementNode && node.Data == "br" {
  171. results += "\n"
  172. } else {
  173. results += GetAllTextBR(node)
  174. }
  175. }
  176. return results
  177. }