Browse Source

Added user agent updater.

Steve Thielemann 1 month ago
parent
commit
507c90360f
6 changed files with 189 additions and 41 deletions
  1. 41 0
      Cargo.lock
  2. 3 0
      Cargo.toml
  3. 11 0
      initial.config
  4. 24 1
      src/fetch.rs
  5. 81 27
      src/main.rs
  6. 29 13
      src/parse.rs

+ 41 - 0
Cargo.lock

@@ -17,6 +17,15 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "anstream"
 version = "0.6.18"
@@ -1133,6 +1142,35 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "regex"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
 [[package]]
 name = "reqwest"
 version = "0.12.12"
@@ -1674,8 +1712,11 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "clap",
+ "regex",
  "reqwest",
  "scraper",
+ "serde",
+ "serde_json",
  "url",
 ]
 

+ 3 - 0
Cargo.toml

@@ -6,6 +6,9 @@ edition = "2021"
 [dependencies]
 anyhow = "1.0.95"
 clap = { version = "4.5.27", features = ["derive"] }
+regex = "1.11.1"
 reqwest = { version = "0.12.12", features = ["blocking"] }
 scraper = "0.22.0"
+serde = { version = "1.0.217", features = ["derive"] }
+serde_json = "1.0.138"
 url = "2.5.4"

+ 11 - 0
initial.config

@@ -0,0 +1,11 @@
+{
+  "user_agent": "Mozilla/5.0 (X11; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0",
+  "versions": {
+    "NASB2020": "https://www.bible.com/bible/2692/GEN.1.NASB2020",
+    "NKJV": "https://www.bible.com/bible/114/GEN.1.NKJV",
+    "YLT98": "https://www.bible.com/bible/821/GEN.1.YLT98",
+    "KJV": "https://www.bible.com/bible/1/GEN.1.KJV",
+    "ESV": "https://www.bible.com/bible/59/GEN.1.ESV",
+    "NIV": "https://www.bible.com/bible/111/GEN.INTRO1.NIV"
+  }
+}

+ 24 - 1
src/fetch.rs

@@ -1,11 +1,34 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, bail};
 use std::{fs::File, io::Write, path::Path};
+use url::Url;
+use super::parse;
+
+/// Convert relative to absolute
+pub fn relative_to_absolute(url: &str, href: &str) -> Result<String> {
+    let base_url = Url::parse(url)?;
+    let new_url = base_url.join(href)?;
+    Ok(new_url.to_string())
+}
 
 pub struct FetchResult {
     pub cached: bool,
     pub html: String,
 }
 
+pub fn agent_update(user_agent: &str) -> Result<String> {
+    let client = reqwest::blocking::Client::builder()
+    .user_agent(user_agent)
+    .build()?;
+
+    let result = fetch(&client, "https://www.mozilla.org/en-US/firefox/releases/")?;
+    if let Ok(v) = parse::find_versions(&result) {
+        if v != user_agent {
+            return Ok(v);
+        }
+    }
+    bail!("No user_agent updates.");
+}
+
 #[allow(dead_code)]
 // Should this always fetch it/save?
 pub fn fetch(client: &reqwest::blocking::Client, url: &str) -> Result<String> {

+ 81 - 27
src/main.rs

@@ -10,6 +10,7 @@ use std::{
 };
 use std::{thread, time::Duration};
 
+mod config;
 mod fetch;
 mod parse;
 
@@ -26,6 +27,10 @@ struct Cli {
     #[arg(short, long, default_value = "ESV")]
     version: String,
 
+    /// User Agent
+    #[arg(short, long, action=clap::ArgAction::SetTrue)]
+    agent_update: bool,
+
     #[command(subcommand)]
     command: Option<Commands>,
 }
@@ -58,14 +63,19 @@ enum Commands {
     Test {},
 }
 
+const CONFIG_FILE : &str = "app.config";
+
+/*
 static APP_USER_AGENT: &str =
-    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
+    "Mozilla/5.0 (X11; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0";
+*/
 
 // Not needed, I process relative URLs correctly now.
 // static BASE_URL: &str = "https://www.bible.com";
 
 static VOD_URL: &str = "https://www.bible.com/verse-of-the-day";
 
+/*
 static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
     HashMap::from([
         ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
@@ -76,6 +86,7 @@ static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
         ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
     ])
 });
+*/
 
 static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
     Vec::from([
@@ -148,10 +159,35 @@ fn find_files(base_dir: &str, version: &str) -> Vec<String> {
 // https://www.bible.com/verse-of-the-day
 
 fn main() -> Result<()> {
+    let mut config = config::read_config(CONFIG_FILE)?;
+
     let cli = Cli::parse();
     // println!("Work Dir: {:?}", cli.work);
     // println!("Bible: {:?}", cli.bible);
 
+    if !config.versions.contains_key(cli.version.as_str()) {
+        println!("Sorry, I don't know about Bible Version [{}].", cli.version);
+        println!("I do know about the following:");
+
+        // Keys in arbitrary order.
+        for (name, _) in config.versions.iter() {
+            println!("  {}", name);
+        }
+        return Ok(());
+    }
+
+    if cli.agent_update {
+        if let Ok(new_agent) = fetch::agent_update(&config.user_agent) {
+            config.user_agent = new_agent;
+            println!("User agent now {}", &config.user_agent);
+            config::write_config(CONFIG_FILE, &config)?;
+            return Ok(());
+        } else {
+            println!("User agent OK.");
+        }
+    }
+
+    /*
     if !VERSION_URLS.contains_key(cli.version.as_str()) {
         println!("Sorry, I don't know about Bible Version [{}].", cli.version);
         println!("I do know about the following:");
@@ -162,14 +198,15 @@ fn main() -> Result<()> {
         }
         return Ok(());
     }
+    */
 
     match &cli.command {
         Some(Commands::Fetch { delay }) => {
             let client = reqwest::blocking::Client::builder()
-                .user_agent(APP_USER_AGENT)
+                .user_agent(&config.user_agent)
                 .build()?;
             // .unwrap();
-            let mut url = VERSION_URLS[cli.version.as_str()].to_string();
+            let mut url = config.versions[cli.version.as_str()].to_string();
             println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
             let mut more = true;
             let mut cache_hit_once = true;
@@ -184,20 +221,22 @@ fn main() -> Result<()> {
                     url.as_str(),
                 )?;
 
-                let next_chapter = parse::find_next_chapter(&result.html, &url);
+                let next_chapter = parse::find_next_chapter(&result.html);
 
                 if let Ok(next_url) = next_chapter {
                     // Ok!  We have something
                     // more = true;
 
-                    /* 
+                    /*
                     if next_url.starts_with("/") {
                         url = String::from(BASE_URL) + &next_url;
                     } else {
                         url = next_url.to_string();
                     }
                     */
-                    url = next_url;
+                    if let Ok(abs_url) = fetch::relative_to_absolute(&url, &next_url) {
+                        url = abs_url;
+                    }
                 } else {
                     // We didn't find the Next Chapter link, so stop.
                     more = false;
@@ -329,7 +368,7 @@ fn main() -> Result<()> {
 
         Some(Commands::Verse { fetch: _ }) => {
             let client = reqwest::blocking::Client::builder()
-                .user_agent(APP_USER_AGENT)
+                .user_agent(&config.user_agent)
                 .build()?;
 
             println!("Verse of the day.");
@@ -349,33 +388,48 @@ fn main() -> Result<()> {
                 println!("Verse: {}", v.verse);
                 println!("Ref:   {}", v.reference);
                 println!("------");
-            };
+            }
         }
         Some(Commands::Test {}) => {
             println!("Testing...");
+            let client = reqwest::blocking::Client::builder()
+                .user_agent(&config.user_agent)
+                .build()?;
+
+            // They are using react.  There's a token request, which allows them to fetch the daily reading...
+            let odb = fetch::fetch(&client, "https://www.odbm.org/");
+            // See the .har file for more details.
+
+            if let Ok(html) = odb {
+                println!("{}", html);
+            } else {
+                println!("Fetch error: {:?}", odb.unwrap_err());
+            }
+
+            if false {
+                let path = Path::new(&cli.work).join("GEN.1.NIV");
+                let buffer = std::fs::read_to_string(path).unwrap();
+                let document = scraper::Html::parse_document(&buffer);
+
+                let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
+                let _span_class = scraper::Selector::parse("span[class]").unwrap();
+                let span_selector = scraper::Selector::parse("span").unwrap();
 
-            let path = Path::new(&cli.work).join("GEN.1.NIV");
-            let buffer = std::fs::read_to_string(path).unwrap();
-            let document = scraper::Html::parse_document(&buffer);
-
-            let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
-            let _span_class = scraper::Selector::parse("span[class]").unwrap();
-            let span_selector = scraper::Selector::parse("span").unwrap();
-
-            for span in document.select(&span_data_usfm) {
-                if let Some(data) = span.attr("data-usfm") {
-                    println!("data-usfm {}:", data);
-                    let mut lines = Vec::<&str>::new();
-                    for data_span in span.select(&span_selector) {
-                        if let Some(data_class) = data_span.attr("class") {
-                            if data_class.contains("content") {
-                                let mut text = data_span.text().collect::<Vec<_>>();
-                                println!("{} {:?}", data, text);
-                                lines.append(&mut text);
+                for span in document.select(&span_data_usfm) {
+                    if let Some(data) = span.attr("data-usfm") {
+                        println!("data-usfm {}:", data);
+                        let mut lines = Vec::<&str>::new();
+                        for data_span in span.select(&span_selector) {
+                            if let Some(data_class) = data_span.attr("class") {
+                                if data_class.contains("content") {
+                                    let mut text = data_span.text().collect::<Vec<_>>();
+                                    println!("{} {:?}", data, text);
+                                    lines.append(&mut text);
+                                }
                             }
                         }
+                        println!("data {} lines {:?}", data, lines);
                     }
-                    println!("data {} lines {:?}", data, lines);
                 }
             }
             /*

+ 29 - 13
src/parse.rs

@@ -1,13 +1,20 @@
 use anyhow::{bail, Result};
 use scraper;
 use scraper::Element;
-use url::Url;
+use regex::Regex;
 
-/// Convert relate to absolute
-fn relative_to_absolute(url: &str, href: &str) -> Result<String> {
-    let base_url = Url::parse(url)?;
-    let new_url = base_url.join(href)?;
-    Ok(new_url.to_string())
+pub fn find_versions(html: &String) -> Result<String> {
+    let document = scraper::Html::parse_document(&html);
+    let select_a = scraper::Selector::parse("a").unwrap();
+    let version_match = Regex::new(r#"^[0-9.]+$"#).unwrap();
+    for a in document.select(&select_a) {
+        let text = element_text(a);
+
+        if version_match.is_match(&text) {
+            return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
+        }
+    }
+    bail!("Could not locate a version string.");
 }
 
 #[allow(dead_code)]
@@ -42,6 +49,15 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     let mut result: Vec<VerseOfDay> = Vec::new();
 
     // How about this?
+    /*
+    This was built by looking at the structure of the HTML.
+    What I looked for was something that would contain all of the items I was
+    interested in.  Select it in the Web Developer tool. When everything you
+    want is highlighted in the browser page, that's the tag you want.
+    In this case, it was main div div div div. Tag p contained the date.
+    Tags a in a div[class="mbs-2"] had verse and reference.
+     */
+
     {
         // Locate the Verse of the Day div tag.
         let vod_div_select = scraper::Selector::parse("main>div>div>div>div").unwrap();
@@ -147,7 +163,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     // bail!("More dERP!");
 }
 
-pub fn find_next_chapter(html: &String, url: &str) -> Result<String> {
+pub fn find_next_chapter(html: &String) -> Result<String> {
     let document = scraper::Html::parse_document(html);
     // let a_selector = scraper::Selector::parse("div>a").unwrap();
     // This one works:  (starts with "/bible/").
@@ -167,7 +183,7 @@ pub fn find_next_chapter(html: &String, url: &str) -> Result<String> {
 
         if let Some(href) = a.attr("href") {
             //    if href.contains("/bible/") {
-            let href_absolute = relative_to_absolute(url, href)?;
+            // let href_absolute = relative_to_absolute(url, href)?;
 
             let text = a
                 .text()
@@ -188,7 +204,7 @@ pub fn find_next_chapter(html: &String, url: &str) -> Result<String> {
                 // println!("Found: [{:?}]", text[0]);
                 continue;
             }
-            return Ok(href_absolute);
+            return Ok(href.to_string());
 
             //    } else {
             //        println!("href contains: [{}]", href);
@@ -208,13 +224,13 @@ mod tests {
         let html = String::from(
             r#"<div class="[pointer-events:all]"><a href="/bible/59/GEN.2.ESV"><div class="flex items-center justify-center bg-white z-[5] h-[50px] w-[50px] rounded-full border-gray-15 border-small border-solid shadow-light-1 active:shadow-dark-1 active:bg-gray-5 active:border-b-gray-5"><svg width="25" height="25" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg" aria-labelledby="Next Chapter" class="text-gray-25"><title id="Next Chapter">Next Chapter</title><path fill-rule="evenodd" clip-rule="evenodd" d="M8.293 18.707a1 1 0 0 1 0-1.414l4.94-4.94a.5.5 0 0 0 0-.707l-4.94-4.939a1 1 0 0 1 1.414-1.414l5.647 5.646a1.5 1.5 0 0 1 0 2.122l-5.647 5.646a1 1 0 0 1-1.414 0Z" fill="currentColor"></path></svg></div></a></div>"#,
         );
-        let r = find_next_chapter(&html, "https://bible.com/bible/link1");
+        let r = find_next_chapter(&html);
         if !r.is_ok() {
             println!("DEBUG result = {:?}", r);
         }
         assert!(r.is_ok());
         let link = r.unwrap();
-        assert_eq!(link, "https://bible.com/bible/59/GEN.2.ESV");
+        assert_eq!(link, "/bible/59/GEN.2.ESV");
     }
 
     /// This tests when the HTML has been tidied.
@@ -235,12 +251,12 @@ mod tests {
         </a>
         </div>"#,
         );
-        let r = find_next_chapter(&html, "https://bible.com/bible/link1");
+        let r = find_next_chapter(&html);
         if !r.is_ok() {
             println!("DEBUG result = {:?}", r);
         }
         assert!(r.is_ok());
         let link = r.unwrap();
-        assert_eq!(link, "https://bible.com/bible/59/GEN.2.ESV");
+        assert_eq!(link, "/bible/59/GEN.2.ESV");
     }
 }