Browse Source

Added extract --output for JSON output.

Steve Thielemann 1 month ago
parent
commit
736758e4bd
4 changed files with 266 additions and 109 deletions
  1. 23 4
      README.md
  2. 39 0
      src/config.rs
  3. 132 80
      src/main.rs
  4. 72 25
      src/parse.rs

+ 23 - 4
README.md

@@ -9,10 +9,29 @@ Bible from https://www.bible.com.
 The program will update the user-agent string it uses via 
 https://www.mozilla.org/en-US/firefox/releases/ because I can.
 
+-a or --agent-update
+
+# Bible versions
+
+* ESV
+* NIV
+* YLT98
+* KJV
+* NASB2020
+* MKJV
+
+-v NIV
+
+# Relative/absolute URLs
+
+See `relative_to_absolute(url: &str, href: &str) -> Result<String>`
+for how relative links are turned into absolute URLs.
+
 # Using
 
-Copy initial.config to app.config.  
-Make a bible directory.
-Run the fetch command.
-Run the verse command.
+* Copy initial.config to app.config.  
+* Make a bible directory.
+* Run the fetch command.
+* Run the extract command. Try the --output option.
+* Run the verse command.
 

+ 39 - 0
src/config.rs

@@ -24,3 +24,42 @@ pub fn write_config(filename: &str, config: &Configuration) -> Result<()> {
     file.write_all(data.as_bytes())?;
     Ok(())
 }
+
+#[derive(Serialize, Deserialize)]
+pub struct BasicJSON { // Root of the JSON document written by `extract --output`.
+    pub books: Vec<String>, // Book names, in the order each book was first added.
+    pub book: HashMap<String, BasicChaptersJSON>, // Chapter data for each book, keyed by book name.
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct BasicChaptersJSON { // All chapters of a single book.
+    pub chapters: Vec<BasicVersesJSON>, // The extract command stores chapter N at index N - 1.
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct BasicVersesJSON { // All verses of a single chapter.
+    pub verses: Vec<String>, // The extract command stores verse N at index N - 1; gaps are empty strings.
+}
+
+/* 
+impl BasicJSON {
+    pub fn new() -> Self {
+        Self { book: HashMap::new()}
+    }
+}
+*/
+
+/*
+#[derive(Serialize, Deserialize)]
+pub struct BasicVerseJSON {
+    pub verse: u8,
+    pub text: String,
+}
+*/
+
+/// Serialize `json` as pretty-printed JSON and write it to `filename` (created or truncated).
+pub fn save_basic_json(filename: &str, json: &BasicJSON) -> Result<()> {
+    let pretty = serde_json::to_string_pretty(json)?;
+    File::create(filename)?.write_all(pretty.as_bytes())?;
+    Ok(())
+}

+ 132 - 80
src/main.rs

@@ -1,5 +1,6 @@
 use anyhow::Result; // , Context};
 use clap::{Parser, Subcommand};
+use config::save_basic_json;
 use reqwest;
 use scraper;
 use std::{
@@ -46,12 +47,12 @@ enum Commands {
     /// Extract information from cached files
     Extract {
         /// Count
-        #[arg(short, long, default_value = "5")]
-        count: u32,
+        #[arg(short, long)] // , default_value = "5")]
+        count: Option<u32>,
 
-        /// All
-        #[arg(short, long, action=clap::ArgAction::SetTrue)]
-        all: bool,
+        /// Output file
+        #[arg(short, long)]
+        output: Option<PathBuf>,
     },
     /// Verse of the day
     Verse {
@@ -64,7 +65,7 @@ enum Commands {
 }
 
 /// Configuration filename
-const CONFIG_FILE : &str = "app.config";
+const CONFIG_FILE: &str = "app.config";
 
 /// Verse of the Day URL
 static VOD_URL: &str = "https://www.bible.com/verse-of-the-day";
@@ -80,6 +81,77 @@ static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
     ])
 });
 
+static BOOK_NAMES: LazyLock<Vec<&str>> = LazyLock::new(|| { // Full English book names, built on first access.
+    Vec::from([ // 66 entries, Genesis through Revelation. NOTE(review): presumably index-aligned with BOOKS - confirm.
+        "Genesis",
+        "Exodus",
+        "Leviticus",
+        "Numbers",
+        "Deuteronomy",
+        "Joshua",
+        "Judges",
+        "Ruth",
+        "1 Samuel",
+        "2 Samuel",
+        "1 Kings",
+        "2 Kings",
+        "1 Chronicles",
+        "2 Chronicles",
+        "Ezra",
+        "Nehemiah",
+        "Esther",
+        "Job",
+        "Psalm",
+        "Proverbs",
+        "Ecclesiastes",
+        "Song of Solomon",
+        "Isaiah",
+        "Jeremiah",
+        "Lamentations",
+        "Ezekiel",
+        "Daniel",
+        "Hosea",
+        "Joel",
+        "Amos",
+        "Obadiah",
+        "Jonah",
+        "Micah",
+        "Nahum",
+        "Habakkuk",
+        "Zephaniah",
+        "Haggai",
+        "Zechariah",
+        "Malachi",
+        "Matthew", // First New Testament entry (index 39).
+        "Mark",
+        "Luke",
+        "John",
+        "Acts",
+        "Romans",
+        "1 Corinthians",
+        "2 Corinthians",
+        "Galatians",
+        "Ephesians",
+        "Philippians",
+        "Colossians",
+        "1 Thessalonians",
+        "2 Thessalonians",
+        "1 Timothy",
+        "2 Timothy",
+        "Titus",
+        "Philemon",
+        "Hebrews",
+        "James",
+        "1 Peter",
+        "2 Peter",
+        "1 John",
+        "2 John",
+        "3 John",
+        "Jude",
+        "Revelation",
+    ])
+});
+
 /// Maps every entry of `BOOKS` to its 1-based position in that list.
 static BOOK_MAP: LazyLock<HashMap<&str, usize>> = LazyLock::new(|| {
     BOOKS
         .iter()
         .enumerate()
         .map(|(index, name)| (*name, index + 1))
         .collect()
 });
 
@@ -214,102 +286,82 @@ fn main() -> Result<()> {
             println!("I'm finished fetching!");
         }
 
-        Some(Commands::Extract { count, all }) => {
+        Some(Commands::Extract { count, output }) => {
             println!("Extract...");
             let files = find_files(cli.work.to_str().unwrap(), cli.version.as_str());
             let filepath = Path::new(&cli.work);
 
-            let mut chapters: HashMap<String, String> = HashMap::<String, String>::new();
+            // let mut chapters: HashMap<String, String> = HashMap::<String, String>::new();
+            let mut json_output = config::BasicJSON {
+                books: Vec::new(),
+                book: HashMap::new(),
+            };
 
             let mut extractor = |file| {
                 println!("File: {}", file);
-                println!("BV: {:?}", parse::extract_basic_verses(filepath.join(file).to_str().unwrap()));
-                println!("----->");
-                /*
-                let mut filepath = cli.work.clone();
-                filepath = filepath.join(file);
-                */
-                let buffer = std::fs::read_to_string(filepath.join(file)).unwrap();
-                let document = scraper::Html::parse_document(&buffer);
-
-                let h1_selector = scraper::Selector::parse("h1").unwrap();
-                let h1 = document.select(&h1_selector).next().unwrap();
-                println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
-
-                // https://programmersportal.com/the-complete-css-selectors-cheat-sheet-with-examples-and-pdf/
-
-                // let span_selector = scraper::Selector::parse("span").unwrap();
-                let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
-                // parse r#"div>a[href ^="/bible/"]"#
-                let span_class = scraper::Selector::parse("span[class]").unwrap();
-                // span[class="ChapterContent_content__RrUqA"]
-                // let span_class_content = scraper::Selector::parse(r#"span[class~="content"]"#).unwrap();
-                // OK!  ~= probably locates a matching attr line <span class="this that content"> but does not
-                // match <span class="contains_content">!
+                let bv =
+                    parse::extract_basic_verses(filepath.join(file).to_str().unwrap()).unwrap();
+                // println!("BV: {:?}", bv);
+                for bv_item in bv {
+                    if !json_output.book.contains_key(&bv_item.book) {
+                        // book missing
+                        json_output.book.insert(
+                            bv_item.book.clone(),
+                            config::BasicChaptersJSON {
+                                chapters: Vec::new(),
+                            },
+                        );
+                        json_output.books.push(bv_item.book.clone());
+                    }
+                    // Book exists now.
 
-                let _span_class_content =
-                    scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#)
-                        .unwrap();
+                    let chapter = bv_item.chapter;
+                    let verse = bv_item.verse;
 
-                for span in document.select(&span_data_usfm) {
-                    // This will always be successful.
-                    if let Some(data) = span.attr("data-usfm") {
-                        // There can be multiples of these with matching values.
-                        println!("data-usfm {}:", data);
+                    if let Some(mbook) = json_output.book.get_mut(&bv_item.book) {
+                        while chapter as usize > mbook.chapters.len() {
+                            // Chapter is missing.
+                            mbook
+                                .chapters
+                                .push(config::BasicVersesJSON { verses: Vec::new() });
+                        }
 
-                        let lines: String = span
-                            .select(&span_class)
-                            // Only allow elements with attr class that containts "content"
-                            .filter(|x| {
-                                if let Some(c) = x.attr("class") {
-                                    if c.contains("content") {
-                                        return true;
-                                    }
-                                }
-                                false
-                            })
-                            .map(|x| {
-                                // Convert element's text() iterator into a string.
-                                let init = String::new();
-                                let j = x.text().fold(init, |acc, x| {
-                                    // print!( ">> {}<< ", x);
-                                    let mut s = acc;
-                                    if x == "  " {
-                                        // This would be a break/newline.
-                                        s.push_str("\n");
-                                    } else {
-                                        s.push_str(x);
-                                    }
-                                    s
-                                });
-                                // println!("j = {}", j);
-                                j
-                            })
-                            .collect();
-
-                        println!("data {} lines {}", data, lines);
-                        if chapters.contains_key(data) {
-                            chapters.get_mut(data).unwrap().push_str(&lines);
-                        } else {
-                            chapters.insert(data.to_string(), lines);
+                        let mverses = &mut mbook.chapters[chapter as usize - 1].verses;
+                        while verse as usize > mverses.len() {
+                            mverses.push(String::new());
                         }
+                        mverses[verse as usize - 1] = bv_item.text;
                     }
                 }
             };
 
-            if *all {
-                println!("Extract All:");
-                for file in files.iter() {
+            if let Some(count) = *count {
+                // Ok, they gave us a value.  Use it.
+                println!("Extract {}:", count);
+                for file in files.iter().take(count as usize) {
                     extractor(file);
                 }
             } else {
-                println!("Extract {}:", *count);
-                for file in files.iter().take(*count as usize) {
+                println!("Extract All:");
+                for file in files.iter() {
                     extractor(file);
                 }
             }
 
-            println!("Chapters: {:?}", chapters);
+            if let Some(output) = output {
+                // Ok, they gave us a file to use.
+                println!("Saving output: {}", output.to_str().unwrap());
+                save_basic_json(output.to_str().unwrap(), &json_output)?;
+            }
+
+            // println!("Chapters: {:?}", chapters);
+
+            /*
+            // What happened here?
+
+            jq .book.John.chapters[2].verses[15] < test.json
+            "\n “For \n \n God so loved \n \n the world, \n \n \n that he gave his only Son, that whoever believes in him should not \n \n perish but have eternal life. \n"
+             */
 
             /*
             "AMO.8.9": "“And on that day,” declares the Lord God,\n“I will make the sun go down at noonand darken the earth in broad daylight.\n"}

+ 72 - 25
src/parse.rs

@@ -49,29 +49,29 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
 }
 
 /// Extract element verse text
-/// 
+///
 /// This trims the elements, (translating "  " to "\n").
 /// Joins with a single space.
 fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
     let span_class = scraper::Selector::parse("span[class]").unwrap(); // Matches every <span> that has a class attribute.
     let text: String = element
-    .select(&span_class)
-    .filter(|e| {
-        if let Some(c) = e.attr("class") {
-            return c.contains("content");
-        }
-        false
-    })
-    .map(|e| {
-        let text: String = e.text().collect::<String>();
-        if text == "  " {
-            return String::from("\n");
-        } else {
-            return text.trim().to_string();
-        }
-    })
-    .collect::<Vec<String>>()
-    .join(" ");
+        .select(&span_class)
+        .filter(|e| { // Keep only spans whose class attribute contains "content".
+            if let Some(c) = e.attr("class") {
+                return c.contains("content");
+            }
+            false
+        })
+        .map(|e| { // Turn each kept span into one text piece.
+            let text: String = e.text().collect::<String>();
+            if text == "  " { // A span whose whole text is exactly two spaces becomes a newline.
+                return String::from("\n");
+            } else {
+                return text.trim().to_string();
+            }
+        })
+        .collect::<Vec<String>>()
+        .join(" "); // Pieces (including "\n" pieces) are separated by a single space.
     text
 }
 
@@ -159,20 +159,24 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
 #[derive(Debug)]
 pub struct BasicVerse {
     pub book: String, // Text of the page's h1, with the trailing chapter number removed when one was parsed.
-    pub chapter_verse: String,
-    pub verse: String,
+    pub chapter: u8, // Chapter from the data-usfm id; 0 unless the id has exactly two parts after the book code.
+    pub verse: u8, // Verse number: the last dot-separated part of the data-usfm id.
+    // pub chapter_verse: String,
+    pub text: String, // Verse text from verse_element_text; segments landing on an existing entry are appended after "\n".
 }
 
 pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
-    let result = Vec::<BasicVerse>::new();
+    let mut result = Vec::<BasicVerse>::new();
     let buffer = std::fs::read_to_string(filename)?;
     let document = scraper::Html::parse_document(&buffer);
     let h1_selector = scraper::Selector::parse("h1").unwrap();
     let h1 = document.select(&h1_selector).next().unwrap();
-    println!("Heading: {}", element_text(h1));
+    let mut book = element_text(h1);
+    // println!("Heading: {}", element_text(h1));
+    let mut book_trim = true;
 
     let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
-    let span_class_selector = scraper::Selector::parse("span[class]").unwrap();
+    let _span_class_selector = scraper::Selector::parse("span[class]").unwrap();
     let _span_class_content_selector =
         scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
 
@@ -180,14 +184,56 @@ pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
         // This will always be successful.
         if let Some(data) = span.attr("data-usfm") {
             // "GEN.1.2"
-            let _parts = data.split(".").collect::<Vec<_>>();
+            // let ch_ver = data.split(".").skip(1).collect::<String>();
+
+            let parts = data.split(".").skip(1).collect::<Vec<_>>();
+            let mut chapter_number: u8 = 0;
+            if parts.len() == 2 {
+                chapter_number = parts[0].parse()?;
+            }
+
+            if book_trim {
+                // Only trim the book once.
+                book_trim = false;
+                if chapter_number != 0 {
+                    // Remove chapter number from book.
+                    while book.pop() != Some(' ') {
+                        // Check for a problem.
+                        if book.is_empty() {
+                            bail!(format!("Failed to trim the chapter from [{}].", element_text(h1)));
+                        }
+                    }
+                }
+            }
+
+            let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
+
             // GEN, 1, 2
             // But, there's some books that don't have chapters.  Beware!
             let text_try = verse_element_text(span);
             // This looks good.  ;)
-            println!("trying: {:?}", text_try);
+            // println!("{} text: {:?}", data, text_try);
 
+            if let Some(b) = result.get_mut(verse_number as usize - 1) {
+                // Yes, it already exists...
+                // It seems like these should be joined with "\n" instead of " ".
 
+                if !(*b).text.ends_with("\n") && !text_try.starts_with("\n") {
+                    (*b).text.push_str("\n");
+                }
+                b.text.push_str(text_try.as_str());
+            } else {
+                let bv = BasicVerse {
+                    book: book.clone(),
+                    chapter: chapter_number,
+                    verse: verse_number,
+                    text: text_try,
+                };
+
+                result.push(bv);
+            }
+
+            /*
             // There can be multiples of these with matching values.
             let lines: String = span
                 .select(&span_class_selector)
@@ -216,6 +262,7 @@ pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
                 .collect();
 
             println!("data {} lines {:?}", data, lines);
+            */
         }
     }
     Ok(result)