Browse Source

Getting closer on working VOD.

Steve Thielemann 1 month ago
parent
commit
a9f855f149
3 changed files with 124 additions and 61 deletions
  1. 16 4
      src/fetch.rs
  2. 42 54
      src/main.rs
  3. 66 3
      src/parse.rs

+ 16 - 4
src/fetch.rs

@@ -6,13 +6,25 @@ pub struct FetchResult {
     pub html: String,
 }
 
+/// Download `url` with the supplied blocking client and return the response
+/// body as text.
+///
+/// # Errors
+/// Propagates any error from sending the request or from reading the
+/// response body as text.
+pub fn fetch(client: &reqwest::blocking::Client, url: &str) -> Result<String> {
+    let body = client.get(url).send()?.text()?;
+    Ok(body)
+}
+
+/// Return everything after the last `/` in `url` as an owned string.
+///
+/// # Errors
+/// Fails with "Failed to split URL." when `url` contains no `/` at all.
+pub fn filename_from_url(url: &str) -> Result<String> {
+    url.rsplit_once('/')
+        .map(|(_, name)| name.to_string())
+        .context("Failed to split URL.")
+}
+
 pub fn fetch_cache(
     work_dir: &str,
     client: &reqwest::blocking::Client,
     url: &str,
 ) -> Result<FetchResult> {
-    let (_, filename) = url.rsplit_once('/').context("Failed to split URL.")?;
-    let path = Path::new(work_dir).join(filename);
+    let filename = filename_from_url(url)?;
+    // let (_, filename) = url.rsplit_once('/').context("Failed to split URL.")?;
+    let path = Path::new(work_dir).join(&filename);
 
     if path.exists() {
         // File already exists -- use cached version.
@@ -29,8 +41,8 @@ pub fn fetch_cache(
     }
 
     println!("fetch_cache {} => {}", url, filename);
-    let res = client.get(url).send().unwrap();
-    let buffer = res.text().unwrap();
+    let res = client.get(url).send()?;
+    let buffer = res.text()?;
 
     let mut file =
         File::create(&path)

+ 42 - 54
src/main.rs

@@ -16,7 +16,7 @@ mod parse;
 // Setup the command line options
 
 #[derive(Parser)]
-#[command(about, long_about=None)]
+#[command(about = "Downloads and parses Bible verses from https://www.bible.com", long_about=None, arg_required_else_help = true, after_help = "This is very specific to the website's HTML.\nIf it changes, this program might no longer work.")]
 struct Cli {
     /// Working directory
     #[arg(short, long, default_value = "bible")]
@@ -48,6 +48,12 @@ enum Commands {
         #[arg(short, long, action=clap::ArgAction::SetTrue)]
         all: bool,
     },
+    /// Verse of the day
+    Verse {
+        /// Fetch new version
+        #[arg(short, long, action=clap::ArgAction::SetTrue)]
+        fetch: bool,
+    },
     /// Test something out
     Test {},
 }
@@ -56,6 +62,7 @@ static APP_USER_AGENT: &str =
     "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
 
 static BASE_URL: &str = "https://www.bible.com";
+static VOD_URL: &str = "https://www.bible.com/verse-of-the-day";
 
 static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
     HashMap::from([
@@ -163,6 +170,7 @@ fn main() -> Result<()> {
             let mut url = VERSION_URLS[cli.version.as_str()].to_string();
             println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
             let mut more = true;
+            let mut cache_hit_once = true;
 
             while more {
                 let result = fetch::fetch_cache(
@@ -186,52 +194,20 @@ fn main() -> Result<()> {
                         url = next_url.to_string();
                     }
                 } else {
+                    // We didn't find the Next Chapter link, so stop.
                     more = false;
                 }
 
-                /*
-                let document = scraper::Html::parse_document(&result.html);
-
-                // TO FIX
-                // We want to upgrade this to use CSS selectors.
-                // For now, us the "working" code we have.
-                let a_selector = scraper::Selector::parse("div>a").unwrap();
-                for a in document.select(&a_selector) {
-                    // Skip elements with a class attribute
-                    if a.attr("class").is_some() {
-                        continue;
-                    }
-
-                    if let Some(href) = a.attr("href") {
-                        if href.contains("/bible/") {
-                            let text = a.text().collect::<Vec<_>>();
-
-                            if text.len() != 1 {
-                                continue;
-                            }
-                            if text[0] != "Next Chapter" {
-                                continue;
-                            }
-
-                            // Ok! We've found the Next Chapter a element!
-                            if href.starts_with("/") {
-                                url = String::from(BASE_URL) + href;
-                            } else {
-                                url = href.to_string();
-                            }
-
-                            // println!("Found HREF: {} => {}", href, url);
-                            // panic!("Squirrel alert!");
-                            more = true;
-                            break;
-                        }
-                    }
-                }
-                */
-
+                // If there's more to do, add a delay between requests.
                 if more {
                     if !result.cached {
                         thread::sleep(Duration::from_secs(*delay as u64));
+                    } else {
+                        if cache_hit_once {
+                            // Display this message only once.
+                            println!("Using CACHE.");
+                            cache_hit_once = false;
+                        }
                     }
                 }
             }
@@ -337,19 +313,31 @@ fn main() -> Result<()> {
             "AMO.8.9": "“And on that day,” declares the Lord God,\n“I will make the sun go down at noonand darken the earth in broad daylight.\n"}
             ^ noonand ?  Shouldn't that be "noon and"?  Check original. Original has a break between them.  Check merge routine.
              */
-            /*
-            // for file in files.iter().take(*count as usize) {
-            for file in files_iter {
-                /* ESV
-                JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
-                JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
-                JHN.8.11 ["sin no more.”"]
-                JHN.8.11 ["]]"]   <- What is this?  It is the the original HTML.
-                JHN.8.11 ["  "]
-                */
-            }
+            /* ESV
+            JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
+            JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
+            JHN.8.11 ["sin no more.”"]
+            JHN.8.11 ["]]"]   <- What is this?  It is the original HTML.
+            JHN.8.11 ["  "]
             */
         }
+
+        Some(Commands::Verse{ fetch}) => {
+            let client = reqwest::blocking::Client::builder()
+            .user_agent(APP_USER_AGENT)
+            .build()?;
+
+            println!("Verse of the day.");
+            let result = fetch::fetch_cache(cli.work
+                .as_os_str()
+                .to_str()
+                .expect("Work should be valid."), &client, VOD_URL)?;
+            if result.cached {
+                println!("(from cache):");
+            }
+            let _v = parse::find_vod(&result.html);
+
+        }
         Some(Commands::Test {}) => {
             println!("Testing...");
 
@@ -396,8 +384,8 @@ fn main() -> Result<()> {
             */
         }
         None => {
-            println!("Looking for FETCH or EXTRACT");
-            println!("I've got nothing to do here...");
+            println!("I didn't see a command.  Displaying help.\n");
+            let _show_help : Cli = Cli::parse_from(["--help"]);
         }
     }
 

+ 66 - 3
src/parse.rs

@@ -1,5 +1,67 @@
 use anyhow::{bail, Result};
 use scraper;
+use scraper::Element;
+
+/// Return the next sibling of `element` that is itself an element.
+///
+/// # Errors
+/// Fails with "No more elements." when there is no following sibling element.
+fn next_element(element: scraper::ElementRef<'_>) -> Result<scraper::ElementRef<'_>> {
+    match element.next_sibling_element() {
+        Some(sibling) => Ok(sibling),
+        None => bail!("No more elements."),
+    }
+}
+
+/// Work-in-progress scraper for the verse-of-the-day page.
+///
+/// Parses `html` and prints diagnostic output for what has been located so
+/// far: the first `div>h1` and its next sibling element, every
+/// `div[class="mlb-2"]` block with its `/bible/` links, and finally every
+/// `div>a` `/bible/` link in the document. No verse is extracted yet, so the
+/// function always ends by failing with "More dERP!".
+///
+/// # Errors
+/// Returns an error when the document has no `div>h1`, when that heading has
+/// no following sibling element, and -- for now -- unconditionally at the end.
+pub fn find_vod(html: &String) -> Result<String> {
+    let document = scraper::Html::parse_document(html);
+
+    // The selector strings in this function are literals, so a parse failure
+    // would be a programming error rather than bad input; `unwrap` is kept
+    // for those only.
+
+    // Verse of the day is div with two a's.
+    let h1_selector = scraper::Selector::parse("div>h1").unwrap();
+    let Some(h1) = document.select(&h1_selector).next() else {
+        // Page layout is outside our control: report instead of panicking.
+        bail!("No div>h1 element found.");
+    };
+    println!("{:?}", h1.text().collect::<Vec<_>>());
+    // Only the first sibling after the heading is inspected so far.
+    let a = next_element(h1)?;
+    println!("a : {}", a.html());
+
+    // Previous ones are in div[class="mlb-2"]
+    let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
+    let a_selector1 =
+        scraper::Selector::parse(r#"a[href^="/bible/"][class~="no-underline"]"#).unwrap();
+    let p_selector = scraper::Selector::parse("div>p").unwrap();
+
+    for prev_div in document.select(&prev_div_selector) {
+        if let Some(p) = prev_div.select(&p_selector).next() {
+            println!("{:?}", p.text().collect::<Vec<_>>());
+        }
+
+        for a in prev_div.select(&a_selector1) {
+            if a.attr("href").is_some() {
+                println!("{:?}", a.text().collect::<Vec<_>>());
+            }
+        }
+        println!("-----");
+    }
+
+    println!("And finally...");
+    let a_selector =
+        scraper::Selector::parse(r#"div>a[href^="/bible/"][class~="no-underline"]"#).unwrap();
+    for a in document.select(&a_selector) {
+        if a.attr("href").is_some() {
+            println!("{:?}", a.text().collect::<Vec<_>>());
+            println!("html: {}", a.html());
+        }
+    }
+    bail!("More dERP!");
+}
 
 pub fn find_next_chapter(html: &String) -> Result<String> {
     let document = scraper::Html::parse_document(html);
@@ -58,7 +120,9 @@ mod tests {
     /// Test HTML as given to us by the website.
     #[test]
     fn chapter_test() {
-        let html = String::from(r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#);
+        let html = String::from(
+            r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#,
+        );
         let r = find_next_chapter(&html);
         if !r.is_ok() {
             println!("DEBUG result = {:?}", r);
@@ -69,7 +133,7 @@ mod tests {
     }
 
     /// This tests when the HTML has been tidied.
-    /// 
+    ///
     /// HTML has newlines and spaces added, rather than condensed.
     #[test]
     fn chapter_test_tidy() {
@@ -94,5 +158,4 @@ mod tests {
         let link = r.unwrap();
         assert_eq!(link, "/bible/59/GEN.2.ESV");
     }
-
 }