Browse Source

Updated with url crate.

This allows for proper relative URL handling, via join.
Steve Thielemann 1 month ago
parent
commit
85e66f936d
5 changed files with 52 additions and 22 deletions
  1. 1 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 4 0
      src/fetch.rs
  4. 1 1
      src/main.rs
  5. 45 21
      src/parse.rs

+ 1 - 0
Cargo.lock

@@ -1676,6 +1676,7 @@ dependencies = [
  "clap",
  "reqwest",
  "scraper",
+ "url",
 ]
 
 [[package]]

+ 1 - 0
Cargo.toml

@@ -8,3 +8,4 @@ anyhow = "1.0.95"
 clap = { version = "4.5.27", features = ["derive"] }
 reqwest = { version = "0.12.12", features = ["blocking"] }
 scraper = "0.22.0"
+url = "2.5.4"

+ 4 - 0
src/fetch.rs

@@ -6,17 +6,21 @@ pub struct FetchResult {
     pub html: String,
 }
 
+#[allow(dead_code)]
+// Should this always fetch it/save?
 pub fn fetch(client: &reqwest::blocking::Client, url: &str) -> Result<String> {
     let res = client.get(url).send()?;
     let buffer = res.text()?;
     Ok(buffer)
 }
 
+/// Extract filename from the end of a URL.
 pub fn filename_from_url(url: &str) -> Result<String> {
     let (_, filename) = url.rsplit_once('/').context("Failed to split URL.")?;
     Ok(filename.to_string())
 }
 
+// Option to always fetch, ignore cache?
 pub fn fetch_cache(
     work_dir: &str,
     client: &reqwest::blocking::Client,

+ 1 - 1
src/main.rs

@@ -182,7 +182,7 @@ fn main() -> Result<()> {
                     url.as_str(),
                 )?;
 
-                let next_chapter = parse::find_next_chapter(&result.html);
+                let next_chapter = parse::find_next_chapter(&result.html, &url);
 
                 if let Ok(next_url) = next_chapter {
                     // Ok!  We have something

+ 45 - 21
src/parse.rs

@@ -1,7 +1,17 @@
 use anyhow::{bail, Result};
 use scraper;
 use scraper::Element;
+use url::Url;
 
+/// Convert a relative href to an absolute URL.
+fn relative_to_absolute(url: &str, href: &str) -> Result<String> {
+    let base_url = Url::parse(url)?;
+    let new_url = base_url.join(href)?;
+    Ok(new_url.to_string())
+}
+
+#[allow(dead_code)]
+/// Find next sibling element.
 fn next_element(element: scraper::ElementRef<'_>) -> Result<scraper::ElementRef<'_>> {
     let next_node = element.next_sibling_element();
 
@@ -18,13 +28,10 @@ pub struct VerseOfDay {
 }
 
 fn element_text(element: scraper::ElementRef<'_>) -> String {
-    let text = element.text()
-        .map(|s| {
-            s.trim_matches(char::is_whitespace)
-        })
-        .filter(|x| {
-            !x.is_empty()
-        })
+    let text = element
+        .text()
+        .map(|s| s.trim_matches(char::is_whitespace))
+        .filter(|x| !x.is_empty())
         .collect::<String>();
     text
 }
@@ -40,7 +47,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
         let vod_div_select = scraper::Selector::parse("main>div>div>div>div").unwrap();
         if let Some(vod_div) = document.select(&vod_div_select).next() {
             // Ok, search just in this div for things of interest.
-            /* 
+            /*
             // h1 text is "Verse of the Day"
             let h1_select = scraper::Selector::parse("h1").unwrap();
             let h1 = vod_div.select(&h1_select).next().unwrap();
@@ -51,12 +58,21 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
             // println!("p = {}", element_text(p)); // p.text().collect::<Vec<_>>());
 
             let a_select = scraper::Selector::parse(r#"div[class~="mbs-2"]>a"#).unwrap();
-            let mut verse_info = vod_div.select(&a_select).map(|a| element_text(a)).collect::<Vec<String>>();
+            let mut verse_info = vod_div
+                .select(&a_select)
+                .map(|a| element_text(a))
+                .collect::<Vec<String>>();
 
             if verse_info.len() == 2 {
-                result.push(VerseOfDay{date: element_text(p), verse:verse_info.remove(0), reference:verse_info.remove(0)});
+                result.push(VerseOfDay {
+                    date: element_text(p),
+                    verse: verse_info.remove(0),
+                    reference: verse_info.remove(0),
+                });
+            } else {
+                bail!("Unable to locate today's verse.  Has the HTML changed?");
             }
-            /* 
+            /*
             for a in vod_div.select(&a_select) {
                 println!("a = {}", element_text(a)); // a.text().collect::<Vec<_>>());
             }
@@ -64,7 +80,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
         }
     }
 
-    /* 
+    /*
     // Verse of the day is div with two a's.
     let h1_selector = scraper::Selector::parse("div>h1").unwrap();
     let h1 = document.select(&h1_selector).next().unwrap();
@@ -88,14 +104,21 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
 
     for prev_div in document.select(&prev_div_selector) {
         if let Some(p) = prev_div.select(&p_selector).next() {
-            let mut verse_info = prev_div.select(&a_selector1).map(|a| element_text(a)).collect::<Vec<String>>();
+            let mut verse_info = prev_div
+                .select(&a_selector1)
+                .map(|a| element_text(a))
+                .collect::<Vec<String>>();
             if verse_info.len() == 2 {
-                result.push(VerseOfDay{date: element_text(p), verse: verse_info.remove(0), reference: verse_info.remove(0)});
+                result.push(VerseOfDay {
+                    date: element_text(p),
+                    verse: verse_info.remove(0),
+                    reference: verse_info.remove(0),
+                });
             }
             // println!("{}", element_text(p)); // p.text().collect::<Vec<_>>());
         }
 
-        /* 
+        /*
         for a in prev_div.select(&a_selector1) {
             if let Some(href) = a.attr("href") {
                 // let text = a.text().collect::<Vec<_>>();
@@ -124,7 +147,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     // bail!("More dERP!");
 }
 
-pub fn find_next_chapter(html: &String) -> Result<String> {
+pub fn find_next_chapter(html: &String, url: &str) -> Result<String> {
     let document = scraper::Html::parse_document(html);
     // let a_selector = scraper::Selector::parse("div>a").unwrap();
     // This one works:  (starts with "/bible/").
@@ -144,6 +167,7 @@ pub fn find_next_chapter(html: &String) -> Result<String> {
 
         if let Some(href) = a.attr("href") {
             //    if href.contains("/bible/") {
+            let href_absolute = relative_to_absolute(url, href)?;
 
             let text = a
                 .text()
@@ -164,7 +188,7 @@ pub fn find_next_chapter(html: &String) -> Result<String> {
                 // println!("Found: [{:?}]", text[0]);
                 continue;
             }
-            return Ok(href.to_string());
+            return Ok(href_absolute);
 
             //    } else {
             //        println!("href contains: [{}]", href);
@@ -184,13 +208,13 @@ mod tests {
         let html = String::from(
             r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#,
         );
-        let r = find_next_chapter(&html);
+        let r = find_next_chapter(&html, "https://bible.com/bible/link1");
         if !r.is_ok() {
             println!("DEBUG result = {:?}", r);
         }
         assert!(r.is_ok());
         let link = r.unwrap();
-        assert_eq!(link, "/bible/59/GEN.2.ESV");
+        assert_eq!(link, "https://bible.com/bible/59/GEN.2.ESV");
     }
 
     /// This tests when the HTML has been tidied.
@@ -211,12 +235,12 @@ mod tests {
         </a>
         </div>"#,
         );
-        let r = find_next_chapter(&html);
+        let r = find_next_chapter(&html, "https://bible.com/bible/link1");
         if !r.is_ok() {
             println!("DEBUG result = {:?}", r);
         }
         assert!(r.is_ok());
         let link = r.unwrap();
-        assert_eq!(link, "/bible/59/GEN.2.ESV");
+        assert_eq!(link, "https://bible.com/bible/59/GEN.2.ESV");
     }
 }