
Added clap. Added more Bible versions.

Steve Thielemann 1 month ago
parent
commit
0dddafd4cc
3 changed files with 435 additions and 24 deletions
  1. Cargo.lock  +121 −0
  2. Cargo.toml  +1 −0
  3. src/main.rs  +313 −24

+ 121 - 0
Cargo.lock

@@ -17,6 +17,56 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
 
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
+dependencies = [
+ "anstyle",
+ "once_cell",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "atomic-waker"
 version = "1.1.2"
@@ -89,6 +139,52 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "clap"
+version = "4.5.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+
 [[package]]
 name = "core-foundation"
 version = "0.9.4"
@@ -364,6 +460,12 @@ version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
 [[package]]
 name = "html5ever"
 version = "0.29.0"
@@ -645,6 +747,12 @@ version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "itoa"
 version = "1.0.14"
@@ -1355,6 +1463,12 @@ dependencies = [
  "quote",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "subtle"
 version = "2.6.1"
@@ -1552,6 +1666,7 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 name = "tutorial"
 version = "0.1.0"
 dependencies = [
+ "clap",
  "reqwest",
  "scraper",
 ]
@@ -1603,6 +1718,12 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
 
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"

+ 1 - 0
Cargo.toml

@@ -4,5 +4,6 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+clap = { version = "4.5.27", features = ["derive"] }
 reqwest = { version = "0.12.12", features = ["blocking"] }
 scraper = "0.22.0"
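
The `derive` feature flag is what enables the `#[derive(Parser)]` and `#[derive(Subcommand)]` attributes used in src/main.rs below. As a minimal, self-contained sketch of that API (hypothetical, not part of this commit):

    use clap::Parser;

    /// Tiny demo of clap's derive API; names here are made up for illustration.
    #[derive(Parser)]
    #[command(about = "Demo of the derive feature")]
    struct Demo {
        /// Bible version to use (e.g. ESV, KJV)
        #[arg(short, long, default_value = "ESV")]
        bible: String,
    }

    fn main() {
        let demo = Demo::parse();
        println!("Selected version: {}", demo.bible);
    }

With the derive API, the doc comments on the struct and its fields become the `--help` text, which is the same mechanism the new `Cli` struct in src/main.rs relies on.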

+ 313 - 24
src/main.rs

@@ -1,45 +1,334 @@
 use reqwest;
-use std::{fs::File, path::Path, io::Read};
 
-use scraper::{Html, Selector};
+// use std::io::Write;
+// use std::io::Read;
+
+use scraper;
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::LazyLock,
+};
+
+use clap::{Parser, Subcommand};
+use std::{thread, time::Duration};
+
+#[derive(Parser)]
+#[command(about, long_about=None)]
+struct Cli {
+    /// Working directory
+    #[arg(short, long, default_value = "bible")]
+    work: PathBuf,
+
+    /// Bible Version
+    #[arg(short, long, default_value = "ESV")]
+    bible: String,
+
+    #[command(subcommand)]
+    command: Option<Commands>,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    /// Fetch from the web, using work directory for cache
+    Fetch {
+        /// Delay
+        #[arg(short, long, default_value = "10")]
+        delay: u32,
+    },
+    /// Extract information from cached files
+    Extract {},
+    /// Test something out
+    Test {},
+}
 
 static APP_USER_AGENT: &str =
     "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
-static URL: &str = "https://www.bible.com/bible/59/GEN.1.ESV";
+
+static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
+    HashMap::from([
+        ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
+        ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
+        ("NIV", "https://www.bible.com/bible/111/GEN.1.NIV"),
+        ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
+    ])
+});
+
+// Start here
+// static URL: &str = "https://www.bible.com/bible/59/PSA.95.ESV";
+// "https://www.bible.com/bible/59/GEN.1.ESV";
+
+// And maybe:
+// https://www.bible.com/bible/2692/GEN.1.NASB2020
+// https://www.bible.com/bible/1/GEN.1.KJV
+// https://www.bible.com/bible/2692/GEN.1.NASB2020
+// https://www.bible.com/bible/111/GEN.1.NIV
+// https://www.bible.com/bible/821/GEN.1.YLT98
+// Catholic
+// https://www.bible.com/bible/42/GEN.1.CPDV
+
+// Audio
+// https://www.bible.com/audio-bible/59/GEN.1.ESV
+// <script type="application/ld+json">{"@context":"https://schema.org","@type":"AudioObject","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.bible.com/audio-bible/59/GEN.1.ESV"},"headline":"Audio Bible: Listen to Genesis 1 English Standard Version 2016 ESV","contentUrl":"https://audio-bible-cdn.youversionapi.com/1/32k/GEN/1-9dcefc68c6f7244489f59475fc7a1866.mp3?version_id=59",
+
+// https://www.bible.com/verse-of-the-day
+
+struct FetchResult {
+    cached: bool,
+    html: String,
+}
+
+fn fetch_cache(client: &reqwest::blocking::Client, url: &str) -> FetchResult {
+    let (_, filename) = url.rsplit_once('/').unwrap();
+    let path = Path::new("bible").join(filename);
+
+    if path.exists() {
+        // File already exists -- use cached version.
+        let buffer = std::fs::read_to_string(path).unwrap();
+        return FetchResult {
+            cached: true,
+            html: buffer,
+        };
+    }
+
+    println!("fetch_cache {} => {}", url, filename);
+    let res = client.get(url).send().unwrap();
+    let buffer = res.text().unwrap();
+
+    let mut file = File::create(path).unwrap();
+    let _ = file.write_all(buffer.as_bytes());
+    FetchResult {
+        cached: false,
+        html: buffer,
+    }
+}
 
 fn main() {
+    let cli = Cli::parse();
+    // println!("Work Dir: {:?}", cli.work);
+    // println!("Bible: {:?}", cli.bible);
+
+    if !VERSION_URLS.contains_key(cli.bible.as_str()) {
+        println!("Sorry, I don't know about Bible [{}].", cli.bible);
+        println!("I do know about the following:");
+
+        // List the known versions (HashMap iteration order is arbitrary, so not sorted).
+        for (name, _) in VERSION_URLS.iter() {
+            println!("  {}", name);
+        }
+        return;
+    }
+
+    match &cli.command {
+        Some(Commands::Fetch { delay }) => {
+            let client = reqwest::blocking::Client::builder()
+                .user_agent(APP_USER_AGENT)
+                .build()
+                .unwrap();
+            let mut url = VERSION_URLS[cli.bible.as_str()].to_string();
+            println!("Fetch! [{}] with delay {} secs.", cli.bible, delay);
+            let mut more = true;
+
+            while more {
+                let result = fetch_cache(&client, url.as_str());
+                more = false;
+
+                let document = scraper::Html::parse_document(&result.html);
+
+                // TO FIX
+                // We want to upgrade this to use CSS selectors.
+                // For now, use the "working" code we have.
+                let a_selector = scraper::Selector::parse("div>a").unwrap();
+                for a in document.select(&a_selector) {
+
+                    // Skip elements with a class attribute
+                    if a.attr("class").is_some() {
+                        continue;
+                    }
+
+                    if let Some(href) = a.attr("href") {
+                        if href.contains("/bible/") {
+                            let text = a.text().collect::<Vec<_>>();
+
+                            if text.len() != 1 {
+                                continue;
+                            }
+                            if text[0] != "Next Chapter" {
+                                continue;
+                            }
+
+                            // Ok! We've found the Next Chapter a element!
+                            if href.starts_with("/") {
+                                url = String::from("https://www.bible.com") + href;
+                            } else {
+                                url = href.to_string();
+                            }
+
+                            // println!("Found HREF: {} => {}", href, url);
+                            // panic!("Squirrel alert!");
+                            more = true;
+                            break;
+                        }
+                    }
+                }
+
+                if more {
+                    if !result.cached {
+                        thread::sleep(Duration::from_secs(*delay as u64));
+                    }
+                }
+            }
+            println!("I'm finished fetching!");
+        }
+        Some(Commands::Extract {}) => {
+            println!("Extract...");
+        }
+        Some(Commands::Test {}) => {
+            println!("Testing...");
+        }
+        None => {
+            println!("Looking for FETCH or EXTRACT");
+            println!("I've got nothing to do here...");
+        }
+    }
+
+    /*
+    return;
+
+    
     let client = reqwest::blocking::Client::builder()
         .user_agent(APP_USER_AGENT)
         .build()
         .unwrap();
 
-    // let res = client.get("https://httpbin.org/anything").send().unwrap();
-    // println!("anything: {}", res.text().unwrap());
+    if true {
+        let mut url = String::from(URL);
+        let mut previous_url = url.clone();
+        let mut working = true;
 
-    let mut file = File::open(Path::new("fetch1.html")).unwrap();
-    let mut buffer = String::new();
-    let _ = file.read_to_string(&mut buffer);
-    drop(file);
+        while working {
+            // Begin the fetching process...
+            let result = fetch_cache(&client, url.as_str());
 
-    /* 
-    let res = client.get(URL).send().unwrap();
-    let buffer = res.text().unwrap();
-    println!("{}", res.text().unwrap());
-    */
+            working = false;
+
+            // Ok, HTML, get parsing!
+            let document = scraper::Html::parse_document(&result.html);
+
+            // For now, I don't care about parsing, just crawling/saving.
+            // Locate the a href.
+            let a_selector = scraper::Selector::parse("div>a").unwrap();
+            for a in document.select(&a_selector) {
+                let c = a.attr("class");
+                if c.is_some() {
+                    continue;
+                }
+
+                let href = a.attr("href");
 
-    let document = scraper::Html::parse_document(&buffer);
-    let h1_selector = scraper::Selector::parse("h1").unwrap();
+                if href.is_some() {
+                    let href = href.unwrap();
+                    if href.contains("/bible/") {
+                        let text = a.text().collect::<Vec<_>>();
 
-    let h1 = document.select(&h1_selector).next().unwrap();
-    println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
+                        if text.len() != 1 {
+                            continue;
+                        }
+                        if text[0] != "Next Chapter" {
+                            continue;
+                        }
 
-    let span_selector = scraper::Selector::parse("span").unwrap();
-    for span in document.select(&span_selector) {
-        if span.attr("data-usfm").is_some() {
-            println!("span: {:?}", span.text().collect::<Vec<_>>());
+                        // previous_url = url;
+                        url = href.to_string();
+                        if url.starts_with("/") {
+                            url = String::from("https://www.bible.com") + &url;
+                        }
+
+                        working = true;
+                        break;
+                    }
+                }
+            }
+            if working {
+                if !result.cached {
+                    // Don't sleep if the results we just got were from the cache.
+                    thread::sleep(Duration::from_secs(10));
+                }
+            }
+        }
+        println!("I'm finished.");
+    } else {
+        // let res = client.get("https://httpbin.org/anything").send().unwrap();
+        // println!("anything: {}", res.text().unwrap());
+
+        let mut file = File::open(Path::new("fetch1.html")).unwrap();
+        let mut buffer = String::new();
+        let _ = file.read_to_string(&mut buffer);
+        drop(file);
+
+        /*
+        let res = client.get(URL).send().unwrap();
+        let buffer = res.text().unwrap();
+        println!("{}", res.text().unwrap());
+        */
+
+        let document = scraper::Html::parse_document(&buffer);
+        let h1_selector = scraper::Selector::parse("h1").unwrap();
+
+        let h1 = document.select(&h1_selector).next().unwrap();
+        println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
+
+        // Selectors can't match on "if it exists" or
+        // return a matching value, type fields.
+
+        let span_selector = scraper::Selector::parse("span").unwrap();
+        for span in document.select(&span_selector) {
+            let data = span.attr("data-usfm");
+
+            if data.is_some() {
+                // Ok, iterate over these items...
+                for data_span in span.select(&span_selector) {
+                    let d = data_span.attr("class");
+                    // println!("data_span: {:?}", data_span);
+                    if d.is_some() {
+                        let dt = d.unwrap();
+                        if dt.contains("content") {
+                            let text = data_span.text().collect::<Vec<_>>();
+                            println!("{} {:?}", data.unwrap(), text)
+                        }
+                    }
+                }
+
+                /*
+                GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
+                GEN.1.25 ["  "]
+                GEN.1.26 ["Then God said, "]
+                GEN.1.26 ["“Let us make man"]
+                GEN.1.26 [" in our image, "]
+                GEN.1.26 ["after our likeness. And "]
+                GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
+                GEN.1.26 ["  "]
+                GEN.1.27 ["So God created man in his own image,"]
+                GEN.1.27 ["in the image of God he created him;"]
+                GEN.1.27 ["  "]
+                GEN.1.27 ["male and female he created them."]
+                GEN.1.27 ["  "]
+                 */
+
+                /*
+                let mut text = span.text().collect::<Vec<_>>();
+
+                println!("span: {} {:?}", data.unwrap(), text);
+                */
+            }
         }
     }
+
     // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
+    */
+
 }
 
 /*
@@ -89,11 +378,11 @@ Book/chapter: <h1>
                             form and void, and darkness was over
                             the face of the deep. And the Spirit of
                             God was hovering over the face of the
-                            waters.</span></span>                         
+                            waters.</span></span>
 Next page link:
 
             <div class="[pointer-events:all]">
               <a href="/bible/59/GEN.2.ESV">
 
 
-*/
+*/
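
The TO FIX comment in the Fetch handler notes that the "Next Chapter" lookup should move to CSS selectors. A possible sketch, assuming scraper 0.22's standard CSS support and assuming (as the current loop does) that the target <a> element has no class attribute:

    use scraper::{Html, Selector};

    /// Find the "Next Chapter" link, if any.
    /// The selector string is an assumption about the page structure:
    /// a <div> child <a> with no class attribute whose href contains "/bible/".
    fn next_chapter_url(html: &str) -> Option<String> {
        let document = Html::parse_document(html);
        let selector = Selector::parse(r#"div > a[href*="/bible/"]:not([class])"#).unwrap();

        for a in document.select(&selector) {
            let text: Vec<_> = a.text().collect();
            if text.len() == 1 && text[0] == "Next Chapter" {
                let href = a.attr("href")?;
                return Some(if href.starts_with('/') {
                    format!("https://www.bible.com{}", href)
                } else {
                    href.to_string()
                });
            }
        }
        None
    }

The attribute and :not() filters take over the manual class/href checks; the comparison against "Next Chapter" still has to happen in Rust, since CSS selectors cannot match on text content. Whether :not([class]) matches the real page markup would need to be verified against the cached HTML.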