//! parse.rs: HTML scraping helpers (version lookup, verse of the day, next-chapter link).
  1. use anyhow::{bail, Result};
  2. use scraper;
  3. use scraper::Element;
  4. use regex::Regex;
  5. pub fn find_versions(html: &String) -> Result<String> {
  6. let document = scraper::Html::parse_document(&html);
  7. let select_a = scraper::Selector::parse("a").unwrap();
  8. let version_match = Regex::new(r#"^[0-9.]+$"#).unwrap();
  9. for a in document.select(&select_a) {
  10. let text = element_text(a);
  11. if version_match.is_match(&text) {
  12. return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
  13. }
  14. }
  15. bail!("Could not locate a version string.");
  16. }
  17. #[allow(dead_code)]
  18. /// Find next sibling element.
  19. fn next_element(element: scraper::ElementRef<'_>) -> Result<scraper::ElementRef<'_>> {
  20. let next_node = element.next_sibling_element();
  21. if let Some(node) = next_node {
  22. return Ok(node);
  23. }
  24. bail!("No more elements.");
  25. }
/// One "verse of the day" entry as assembled by `find_vod`.
pub struct VerseOfDay {
    /// Text of the paragraph element that `find_vod` reads as the entry's date.
    pub date: String,
    /// Text of the first matching link: the verse itself.
    pub verse: String,
    /// Text of the second matching link: the reference for the verse.
    pub reference: String,
}
  31. fn element_text(element: scraper::ElementRef<'_>) -> String {
  32. let text = element
  33. .text()
  34. .map(|s| s.trim_matches(char::is_whitespace))
  35. .filter(|x| !x.is_empty())
  36. .collect::<String>();
  37. text
  38. }
  39. pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
  40. let document = scraper::Html::parse_document(&html);
  41. // let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap();
  42. let mut result: Vec<VerseOfDay> = Vec::new();
  43. // How about this?
  44. /*
  45. This was build by looking at the structure of the HTML.
  46. What I looked for, was, something that would contain all of the items I was
  47. interested in. Select it in the Web Developer tool. When everything you
  48. want is highlighted in the browser page, that's the tag you want.
  49. In this case, it was main div div div div. Tag p contained the date.
  50. Tags a in a div[class="mbs-2"] had verse and reference.
  51. */
  52. {
  53. // Locate the Verse of the Day div tag.
  54. let vod_div_select = scraper::Selector::parse("main>div>div>div>div").unwrap();
  55. if let Some(vod_div) = document.select(&vod_div_select).next() {
  56. // Ok, search just in this div for things of interest.
  57. /*
  58. // h1 text is "Verse of the Day"
  59. let h1_select = scraper::Selector::parse("h1").unwrap();
  60. let h1 = vod_div.select(&h1_select).next().unwrap();
  61. println!("h1 = {}", element_text(h1)); //h1.text().collect::<Vec<_>>());
  62. */
  63. let p_select = scraper::Selector::parse("p").unwrap();
  64. let p = vod_div.select(&p_select).next().unwrap();
  65. // println!("p = {}", element_text(p)); // p.text().collect::<Vec<_>>());
  66. let a_select = scraper::Selector::parse(r#"div[class~="mbs-2"]>a"#).unwrap();
  67. let mut verse_info = vod_div
  68. .select(&a_select)
  69. .map(|a| element_text(a))
  70. .collect::<Vec<String>>();
  71. if verse_info.len() == 2 {
  72. result.push(VerseOfDay {
  73. date: element_text(p),
  74. verse: verse_info.remove(0),
  75. reference: verse_info.remove(0),
  76. });
  77. } else {
  78. bail!("Unable to locate today's verse. Has the HTML changed?");
  79. }
  80. /*
  81. for a in vod_div.select(&a_select) {
  82. println!("a = {}", element_text(a)); // a.text().collect::<Vec<_>>());
  83. }
  84. */
  85. }
  86. }
  87. // Previous ones are in div[class="mlb-2"]
  88. let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
  89. let a_selector1 =
  90. scraper::Selector::parse(r#"a[href^="/bible/"][class~="no-underline"]"#).unwrap();
  91. let p_selector = scraper::Selector::parse("div>p").unwrap();
  92. println!("=====");
  93. for prev_div in document.select(&prev_div_selector) {
  94. if let Some(p) = prev_div.select(&p_selector).next() {
  95. let mut verse_info = prev_div
  96. .select(&a_selector1)
  97. .map(|a| element_text(a))
  98. .collect::<Vec<String>>();
  99. if verse_info.len() == 2 {
  100. result.push(VerseOfDay {
  101. date: element_text(p),
  102. verse: verse_info.remove(0),
  103. reference: verse_info.remove(0),
  104. });
  105. }
  106. // println!("{}", element_text(p)); // p.text().collect::<Vec<_>>());
  107. }
  108. }
  109. Ok(result)
  110. }
  111. pub fn find_next_chapter(html: &String) -> Result<String> {
  112. let document = scraper::Html::parse_document(html);
  113. // let a_selector = scraper::Selector::parse("div>a").unwrap();
  114. // This one works: (starts with "/bible/").
  115. // let a_selector = scraper::Selector::parse(r#"div>a[href ^= "/bible/"]"#).unwrap();
  116. // This one fails to find what we're looking for. Contains /bible/ or "bible" both fail.
  117. // Ok, using href~="/bible/" fails. It looks for the WORD "/bible/".
  118. // Using *= finds it anywhere. Ah HA!
  119. let a_selector = scraper::Selector::parse(r#"div>a[href*="/bible/"]"#).unwrap();
  120. for a in document.select(&a_selector) {
  121. if a.attr("class").is_some() {
  122. continue;
  123. }
  124. // Since the selector finds href containing /bible/, I don't need some of these tests now.
  125. // I still need this one, so I have the href value.
  126. if let Some(href) = a.attr("href") {
  127. // if href.contains("/bible/") {
  128. // let href_absolute = relative_to_absolute(url, href)?;
  129. let text = a
  130. .text()
  131. .map(|s| {
  132. s.trim_matches(char::is_whitespace) // &[' ', '\n', '\t'])
  133. })
  134. .filter(|x| {
  135. !x.is_empty()
  136. // x.chars().any(|c| (c != ' ') && (c != '\n'))
  137. })
  138. .collect::<Vec<_>>();
  139. // println!("TEXT: {:?}", text);
  140. if text.len() != 1 {
  141. continue;
  142. }
  143. if text[0] != "Next Chapter" {
  144. // println!("Found: [{:?}]", text[0]);
  145. continue;
  146. }
  147. return Ok(href.to_string());
  148. // } else {
  149. // println!("href contains: [{}]", href);
  150. // }
  151. }
  152. }
  153. bail!("Next Chapter not found.");
  154. }
  155. #[cfg(test)]
  156. mod tests {
  157. use super::*;
  158. /// Test HTML as given to us by the website.
  159. #[test]
  160. fn chapter_test() {
  161. let html = String::from(
  162. r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#,
  163. );
  164. let r = find_next_chapter(&html);
  165. if !r.is_ok() {
  166. println!("DEBUG result = {:?}", r);
  167. }
  168. assert!(r.is_ok());
  169. let link = r.unwrap();
  170. assert_eq!(link, "/bible/59/GEN.2.ESV");
  171. }
  172. /// This tests when the HTML has been tidied.
  173. ///
  174. /// HTML has newlines and spaces added, rather then condensed.
  175. #[test]
  176. fn chapter_test_tidy() {
  177. let html = String::from(
  178. r#"<div>
  179. <a href="/bible/59/GEN.2.ESV">
  180. <div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5">
  181. <svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25">
  182. <title id="Next Chapter">Next Chapter</title>
  183. <path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor">
  184. </path>
  185. </svg>
  186. </div>
  187. </a>
  188. </div>"#,
  189. );
  190. let r = find_next_chapter(&html);
  191. if !r.is_ok() {
  192. println!("DEBUG result = {:?}", r);
  193. }
  194. assert!(r.is_ok());
  195. let link = r.unwrap();
  196. assert_eq!(link, "/bible/59/GEN.2.ESV");
  197. }
  198. }