use anyhow::{bail, Result}; use scraper; use scraper::Element; use regex::Regex; pub fn find_versions(html: &String) -> Result { let document = scraper::Html::parse_document(&html); let select_a = scraper::Selector::parse("a").unwrap(); let version_match = Regex::new(r#"^[0-9.]+$"#).unwrap(); for a in document.select(&select_a) { let text = element_text(a); if version_match.is_match(&text) { return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text)); } } bail!("Could not locate a version string."); } #[allow(dead_code)] /// Find next sibling element. fn next_element(element: scraper::ElementRef<'_>) -> Result> { let next_node = element.next_sibling_element(); if let Some(node) = next_node { return Ok(node); } bail!("No more elements."); } pub struct VerseOfDay { pub date: String, pub verse: String, pub reference: String, } fn element_text(element: scraper::ElementRef<'_>) -> String { let text = element .text() .map(|s| s.trim_matches(char::is_whitespace)) .filter(|x| !x.is_empty()) .collect::(); text } pub fn find_vod(html: &String) -> Result> { let document = scraper::Html::parse_document(&html); // let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap(); let mut result: Vec = Vec::new(); // How about this? /* This was build by looking at the structure of the HTML. What I looked for, was, something that would contain all of the items I was interested in. Select it in the Web Developer tool. When everything you want is highlighted in the browser page, that's the tag you want. In this case, it was main div div div div. Tag p contained the date. Tags a in a div[class="mbs-2"] had verse and reference. */ { // Locate the Verse of the Day div tag. let vod_div_select = scraper::Selector::parse("main>div>div>div>div").unwrap(); if let Some(vod_div) = document.select(&vod_div_select).next() { // Ok, search just in this div for things of interest. /* // h1 text is "Verse of the Day" let h1_select = scraper::Selector::parse("h1").unwrap(); let h1 = vod_div.select(&h1_select).next().unwrap(); println!("h1 = {}", element_text(h1)); //h1.text().collect::>()); */ let p_select = scraper::Selector::parse("p").unwrap(); let p = vod_div.select(&p_select).next().unwrap(); // println!("p = {}", element_text(p)); // p.text().collect::>()); let a_select = scraper::Selector::parse(r#"div[class~="mbs-2"]>a"#).unwrap(); let mut verse_info = vod_div .select(&a_select) .map(|a| element_text(a)) .collect::>(); if verse_info.len() == 2 { result.push(VerseOfDay { date: element_text(p), verse: verse_info.remove(0), reference: verse_info.remove(0), }); } else { bail!("Unable to locate today's verse. Has the HTML changed?"); } /* for a in vod_div.select(&a_select) { println!("a = {}", element_text(a)); // a.text().collect::>()); } */ } } // Previous ones are in div[class="mlb-2"] let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap(); let a_selector1 = scraper::Selector::parse(r#"a[href^="/bible/"][class~="no-underline"]"#).unwrap(); let p_selector = scraper::Selector::parse("div>p").unwrap(); println!("====="); for prev_div in document.select(&prev_div_selector) { if let Some(p) = prev_div.select(&p_selector).next() { let mut verse_info = prev_div .select(&a_selector1) .map(|a| element_text(a)) .collect::>(); if verse_info.len() == 2 { result.push(VerseOfDay { date: element_text(p), verse: verse_info.remove(0), reference: verse_info.remove(0), }); } // println!("{}", element_text(p)); // p.text().collect::>()); } } Ok(result) } pub fn find_next_chapter(html: &String) -> Result { let document = scraper::Html::parse_document(html); // let a_selector = scraper::Selector::parse("div>a").unwrap(); // This one works: (starts with "/bible/"). // let a_selector = scraper::Selector::parse(r#"div>a[href ^= "/bible/"]"#).unwrap(); // This one fails to find what we're looking for. Contains /bible/ or "bible" both fail. // Ok, using href~="/bible/" fails. It looks for the WORD "/bible/". // Using *= finds it anywhere. Ah HA! let a_selector = scraper::Selector::parse(r#"div>a[href*="/bible/"]"#).unwrap(); for a in document.select(&a_selector) { if a.attr("class").is_some() { continue; } // Since the selector finds href containing /bible/, I don't need some of these tests now. // I still need this one, so I have the href value. if let Some(href) = a.attr("href") { // if href.contains("/bible/") { // let href_absolute = relative_to_absolute(url, href)?; let text = a .text() .map(|s| { s.trim_matches(char::is_whitespace) // &[' ', '\n', '\t']) }) .filter(|x| { !x.is_empty() // x.chars().any(|c| (c != ' ') && (c != '\n')) }) .collect::>(); // println!("TEXT: {:?}", text); if text.len() != 1 { continue; } if text[0] != "Next Chapter" { // println!("Found: [{:?}]", text[0]); continue; } return Ok(href.to_string()); // } else { // println!("href contains: [{}]", href); // } } } bail!("Next Chapter not found."); } #[cfg(test)] mod tests { use super::*; /// Test HTML as given to us by the website. #[test] fn chapter_test() { let html = String::from( r#""#, ); let r = find_next_chapter(&html); if !r.is_ok() { println!("DEBUG result = {:?}", r); } assert!(r.is_ok()); let link = r.unwrap(); assert_eq!(link, "/bible/59/GEN.2.ESV"); } /// This tests when the HTML has been tidied. /// /// HTML has newlines and spaces added, rather then condensed. #[test] fn chapter_test_tidy() { let html = String::from( r#""#, ); let r = find_next_chapter(&html); if !r.is_ok() { println!("DEBUG result = {:?}", r); } assert!(r.is_ok()); let link = r.unwrap(); assert_eq!(link, "/bible/59/GEN.2.ESV"); } }