123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223 |
- use anyhow::{bail, Result};
- use scraper;
- use scraper::Element;
- use regex::Regex;
- pub fn find_versions(html: &String) -> Result<String> {
- let document = scraper::Html::parse_document(&html);
- let select_a = scraper::Selector::parse("a").unwrap();
- let version_match = Regex::new(r#"^[0-9.]+$"#).unwrap();
- for a in document.select(&select_a) {
- let text = element_text(a);
- if version_match.is_match(&text) {
- return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
- }
- }
- bail!("Could not locate a version string.");
- }
- #[allow(dead_code)]
- /// Find next sibling element.
- fn next_element(element: scraper::ElementRef<'_>) -> Result<scraper::ElementRef<'_>> {
- let next_node = element.next_sibling_element();
- if let Some(node) = next_node {
- return Ok(node);
- }
- bail!("No more elements.");
- }
/// One scraped "Verse of the Day" entry.
///
/// All fields hold text exactly as it was extracted from the page; nothing is parsed or
/// normalised beyond whitespace trimming done by the extractor.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VerseOfDay {
    /// Date label shown with the verse (free-form text, not a parsed date).
    pub date: String,
    /// Text of the verse itself.
    pub verse: String,
    /// Scripture reference for the verse.
    pub reference: String,
}
- fn element_text(element: scraper::ElementRef<'_>) -> String {
- let text = element
- .text()
- .map(|s| s.trim_matches(char::is_whitespace))
- .filter(|x| !x.is_empty())
- .collect::<String>();
- text
- }
- pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
- let document = scraper::Html::parse_document(&html);
- // let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap();
- let mut result: Vec<VerseOfDay> = Vec::new();
- // How about this?
- /*
- This was build by looking at the structure of the HTML.
- What I looked for, was, something that would contain all of the items I was
- interested in. Select it in the Web Developer tool. When everything you
- want is highlighted in the browser page, that's the tag you want.
- In this case, it was main div div div div. Tag p contained the date.
- Tags a in a div[class="mbs-2"] had verse and reference.
- */
- {
- // Locate the Verse of the Day div tag.
- let vod_div_select = scraper::Selector::parse("main>div>div>div>div").unwrap();
- if let Some(vod_div) = document.select(&vod_div_select).next() {
- // Ok, search just in this div for things of interest.
- /*
- // h1 text is "Verse of the Day"
- let h1_select = scraper::Selector::parse("h1").unwrap();
- let h1 = vod_div.select(&h1_select).next().unwrap();
- println!("h1 = {}", element_text(h1)); //h1.text().collect::<Vec<_>>());
- */
- let p_select = scraper::Selector::parse("p").unwrap();
- let p = vod_div.select(&p_select).next().unwrap();
- // println!("p = {}", element_text(p)); // p.text().collect::<Vec<_>>());
- let a_select = scraper::Selector::parse(r#"div[class~="mbs-2"]>a"#).unwrap();
- let mut verse_info = vod_div
- .select(&a_select)
- .map(|a| element_text(a))
- .collect::<Vec<String>>();
- if verse_info.len() == 2 {
- result.push(VerseOfDay {
- date: element_text(p),
- verse: verse_info.remove(0),
- reference: verse_info.remove(0),
- });
- } else {
- bail!("Unable to locate today's verse. Has the HTML changed?");
- }
- /*
- for a in vod_div.select(&a_select) {
- println!("a = {}", element_text(a)); // a.text().collect::<Vec<_>>());
- }
- */
- }
- }
- // Previous ones are in div[class="mlb-2"]
- let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
- let a_selector1 =
- scraper::Selector::parse(r#"a[href^="/bible/"][class~="no-underline"]"#).unwrap();
- let p_selector = scraper::Selector::parse("div>p").unwrap();
- println!("=====");
- for prev_div in document.select(&prev_div_selector) {
- if let Some(p) = prev_div.select(&p_selector).next() {
- let mut verse_info = prev_div
- .select(&a_selector1)
- .map(|a| element_text(a))
- .collect::<Vec<String>>();
- if verse_info.len() == 2 {
- result.push(VerseOfDay {
- date: element_text(p),
- verse: verse_info.remove(0),
- reference: verse_info.remove(0),
- });
- }
- // println!("{}", element_text(p)); // p.text().collect::<Vec<_>>());
- }
- }
- Ok(result)
- }
- pub fn find_next_chapter(html: &String) -> Result<String> {
- let document = scraper::Html::parse_document(html);
- // let a_selector = scraper::Selector::parse("div>a").unwrap();
- // This one works: (starts with "/bible/").
- // let a_selector = scraper::Selector::parse(r#"div>a[href ^= "/bible/"]"#).unwrap();
- // This one fails to find what we're looking for. Contains /bible/ or "bible" both fail.
- // Ok, using href~="/bible/" fails. It looks for the WORD "/bible/".
- // Using *= finds it anywhere. Ah HA!
- let a_selector = scraper::Selector::parse(r#"div>a[href*="/bible/"]"#).unwrap();
- for a in document.select(&a_selector) {
- if a.attr("class").is_some() {
- continue;
- }
- // Since the selector finds href containing /bible/, I don't need some of these tests now.
- // I still need this one, so I have the href value.
- if let Some(href) = a.attr("href") {
- // if href.contains("/bible/") {
- // let href_absolute = relative_to_absolute(url, href)?;
- let text = a
- .text()
- .map(|s| {
- s.trim_matches(char::is_whitespace) // &[' ', '\n', '\t'])
- })
- .filter(|x| {
- !x.is_empty()
- // x.chars().any(|c| (c != ' ') && (c != '\n'))
- })
- .collect::<Vec<_>>();
- // println!("TEXT: {:?}", text);
- if text.len() != 1 {
- continue;
- }
- if text[0] != "Next Chapter" {
- // println!("Found: [{:?}]", text[0]);
- continue;
- }
- return Ok(href.to_string());
- // } else {
- // println!("href contains: [{}]", href);
- // }
- }
- }
- bail!("Next Chapter not found.");
- }
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `find_next_chapter` locates the Genesis 2 link in `html`.
    ///
    /// On failure the panic message includes the error returned by `find_next_chapter`.
    fn assert_finds_genesis_2(html: String) {
        let link = find_next_chapter(&html).expect("Next Chapter link should be found");
        assert_eq!(link, "/bible/59/GEN.2.ESV");
    }

    /// Test HTML as given to us by the website.
    #[test]
    fn chapter_test() {
        let html = String::from(
            r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#,
        );
        assert_finds_genesis_2(html);
    }

    /// This tests when the HTML has been tidied.
    ///
    /// HTML has newlines and spaces added, rather than condensed.
    #[test]
    fn chapter_test_tidy() {
        let html = String::from(
            r#"<div>
  <a href="/bible/59/GEN.2.ESV">
    <div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5">
      <svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25">
        <title id="Next Chapter">Next Chapter</title>
        <path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor">
        </path>
      </svg>
    </div>
  </a>
</div>"#,
        );
        assert_finds_genesis_2(html);
    }
}
|