Browse Source

Working on collecting verses.

Steve Thielemann 1 month ago
parent
commit
f2102b5b99
1 changed files with 95 additions and 10 deletions
  1. 95 10
      src/main.rs

+ 95 - 10
src/main.rs

@@ -9,6 +9,7 @@ use std::{
     fs::File,
     io::Write,
     path::{Path, PathBuf},
+    string::String,
     sync::LazyLock,
 };
 
@@ -47,15 +48,46 @@ enum Commands {
 static APP_USER_AGENT: &str =
     "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
 
+static BASE_URL: &str = "https://www.bible.com";
+
 static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
     HashMap::from([
         ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
         ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
-        ("NIV", "https://www.bible.com/bible/111/GEN.1.NIV"),
+        ("NIV", "https://www.bible.com/bible/111/GEN.INTRO1.NIV"),
+        // https://www.bible.com/bible/111/GEN.1.NIV"),
         ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
     ])
 });
 
+static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
+    Vec::from([
+        "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH",
+        "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER", "LAM", "EZK",
+        "DAN", "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
+        "MAT", "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL", "1TH",
+        "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
+        "REV",
+    ])
+});
+
+// Return the full path of every entry directly inside base_dir whose file name ends with bible; panics if base_dir cannot be read.
+fn find_files(base_dir: &str, bible: &str) -> Vec<String> {
+    let paths = std::fs::read_dir(base_dir).unwrap();
+    let mut result = Vec::<String>::new();
+
+    for path in paths {
+        if let Ok(dir) = path {
+            let filename = dir.file_name().to_string_lossy().to_string();
+            if filename.ends_with(bible) {
+                // result.push(dir.file_name().to_string_lossy().to_string());
+                result.push(dir.path().to_string_lossy().to_string());
+            }
+        }
+    }
+    result
+}
+
 // Start here
 // static URL: &str = "https://www.bible.com/bible/59/PSA.95.ESV";
 // "https://www.bible.com/bible/59/GEN.1.ESV";
@@ -120,13 +152,13 @@ fn main() {
         }
         return;
     }
-   
+
     match &cli.command {
         Some(Commands::Fetch { delay }) => {
             let client = reqwest::blocking::Client::builder()
-            .user_agent(APP_USER_AGENT)
-            .build()
-            .unwrap();
+                .user_agent(APP_USER_AGENT)
+                .build()
+                .unwrap();
             let mut url = VERSION_URLS[cli.bible.as_str()].to_string();
             println!("Fetch! [{}] with delay {} secs.", cli.bible, delay);
             let mut more = true;
@@ -142,7 +174,6 @@ fn main() {
                 // For now, use the "working" code we have.
                 let a_selector = scraper::Selector::parse("div>a").unwrap();
                 for a in document.select(&a_selector) {
-
                     // Skip elements with a class attribute
                     if a.attr("class").is_some() {
                         continue;
@@ -161,7 +192,7 @@ fn main() {
 
                             // Ok! We've found the Next Chapter a element!
                             if href.starts_with("/") {
-                                url = String::from("https://www.bible.com") + href;
+                                url = String::from(BASE_URL) + href;
                             } else {
                                 url = href.to_string();
                             }
@@ -169,7 +200,7 @@ fn main() {
                             // println!("Found HREF: {} => {}", href, url);
                             // panic!("Squirrel alert!");
                             more = true;
-                            break
+                            break;
                         }
                     }
                 }
@@ -184,9 +215,64 @@ fn main() {
         }
         Some(Commands::Extract {}) => {
             println!("Extract...");
+            let files = find_files(cli.work.to_str().unwrap(), cli.bible.as_str());
+            for file in files.iter().take(5) {
+                println!("File: {}", file);
+                let buffer = std::fs::read_to_string(Path::new(file)).unwrap();
+                let document = scraper::Html::parse_document(&buffer);
+
+                let h1_selector = scraper::Selector::parse("h1").unwrap();
+                let h1 = document.select(&h1_selector).next().unwrap();
+                println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
+
+                let span_selector = scraper::Selector::parse("span").unwrap();
+                for span in document.select(&span_selector) {
+                    if let Some(data) = span.attr("data-usfm") {
+                        println!("{}:", data);
+                        let mut lines = Vec::<&str>::new();
+                        for data_span in span.select(&span_selector) {
+                            if let Some(data_class) = data_span.attr("class") {
+                                if data_class.contains("content") {
+                                    let mut text = data_span.text().collect::<Vec<_>>();
+                                    println!("{} {:?}", data, text);
+                                    lines.append(&mut text);
+                                }
+                            }
+                        }
+                        println!("{} {:?}", data, lines);
+                    }
+                }
+
+                /* ESV 
+                JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
+                JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
+                JHN.8.11 ["sin no more.”"]
+                JHN.8.11 ["]]"]   <- What is this?
+                JHN.8.11 ["  "]
+                */
+            }
         }
         Some(Commands::Test {}) => {
             println!("Testing...");
+            // Test finding div>a[href^="/bible/"]
+            let path = Path::new("bible").join("GEN.1.NIV");
+            let buffer = std::fs::read_to_string(path).unwrap();
+            let document = scraper::Html::parse_document(&buffer);
+
+            // let a_selector = scraper::Selector::parse("div>a").unwrap();
+            let a_selector = scraper::Selector::parse(r#"div>a[href ^="/bible/"]"#).unwrap();
+
+            // This reduces down the number of items to check (from ~40 to 4)!
+            // And invalidates the check for /bible/ !
+
+            for a in document.select(&a_selector) {
+                let text = a.text().collect::<Vec<_>>();
+                println!("text: {:?}", text);
+                if let Some(href) = a.attr("href") {
+                    println!("href = {}", href);
+                }
+                println!("=====");
+            }
         }
         None => {
             println!("Looking for FETCH or EXTRACT");
@@ -197,7 +283,7 @@ fn main() {
     /*
     return;
 
-    
+
     let client = reqwest::blocking::Client::builder()
         .user_agent(APP_USER_AGENT)
         .build()
@@ -328,7 +414,6 @@ fn main() {
 
     // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
     */
-
 }
 
 /*