|
@@ -1,7 +1,8 @@
|
|
|
use anyhow::{bail, Result};
|
|
|
+use regex::Regex;
|
|
|
use scraper;
|
|
|
use scraper::Element;
|
|
|
-use regex::Regex;
|
|
|
+use std::string::String;
|
|
|
|
|
|
pub fn find_versions(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(&html);
|
|
@@ -11,7 +12,10 @@ pub fn find_versions(html: &String) -> Result<String> {
|
|
|
let text = element_text(a);
|
|
|
|
|
|
if version_match.is_match(&text) {
|
|
|
- return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
|
|
|
+ return Ok(format!(
|
|
|
+ "Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}",
|
|
|
+ text, text
|
|
|
+ ));
|
|
|
}
|
|
|
}
|
|
|
bail!("Could not locate a version string.");
|
|
@@ -34,6 +38,7 @@ pub struct VerseOfDay {
|
|
|
pub reference: String,
|
|
|
}
|
|
|
|
|
|
+/// Extract element text, trimmed of whitespace.
|
|
|
fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
let text = element
|
|
|
.text()
|
|
@@ -43,6 +48,33 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
text
|
|
|
}
|
|
|
|
|
|
+/// Extract the verse text from an element.
|
|
|
+///
|
|
|
+/// This trims each element's text (translating " " to "\n").
|
|
|
+/// Joins with a single space.
|
|
|
+fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
+ let span_class = scraper::Selector::parse("span[class]").unwrap();
|
|
|
+ let text: String = element
|
|
|
+ .select(&span_class)
|
|
|
+ .filter(|e| {
|
|
|
+ if let Some(c) = e.attr("class") {
|
|
|
+ return c.contains("content");
|
|
|
+ }
|
|
|
+ false
|
|
|
+ })
|
|
|
+ .map(|e| {
|
|
|
+ let text: String = e.text().collect::<String>();
|
|
|
+ if text == " " {
|
|
|
+ return String::from("\n");
|
|
|
+ } else {
|
|
|
+ return text.trim().to_string();
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect::<Vec<String>>()
|
|
|
+ .join(" ");
|
|
|
+ text
|
|
|
+}
|
|
|
+
|
|
|
pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
let document = scraper::Html::parse_document(&html);
|
|
|
// let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap();
|
|
@@ -96,7 +128,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // Previous ones are in div[class="mlb-2"]
|
|
|
+ // Previous ones are in div[class="mlb-2"]
|
|
|
|
|
|
let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
|
|
|
let a_selector1 =
|
|
@@ -124,6 +156,68 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
Ok(result)
|
|
|
}
|
|
|
|
|
|
+pub struct BasicVerse {
|
|
|
+ pub book: String,
|
|
|
+ pub chapter_verse: String,
|
|
|
+ pub verse: String,
|
|
|
+}
|
|
|
+
|
|
|
+pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
+ let result = Vec::<BasicVerse>::new();
|
|
|
+ let buffer = std::fs::read_to_string(filename)?;
|
|
|
+ let document = scraper::Html::parse_document(&buffer);
|
|
|
+ let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
|
+ let h1 = document.select(&h1_selector).next().unwrap();
|
|
|
+ println!("Heading: {}", element_text(h1));
|
|
|
+
|
|
|
+ let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
|
|
|
+ let span_class_selector = scraper::Selector::parse("span[class]").unwrap();
|
|
|
+ let _span_class_content_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
|
|
|
+
|
|
|
+ for span in document.select(&span_data_usfm_selector) {
|
|
|
+        // The span[data-usfm] selector guarantees the attribute is present, so this always matches.
|
|
|
+ if let Some(data) = span.attr("data-usfm") {
|
|
|
+ // "GEN.1.2"
|
|
|
+ let _parts = data.split(".").collect::<Vec<_>>();
|
|
|
+ // GEN, 1, 2
|
|
|
+            // But there are some books that don't have chapters. Beware!
|
|
|
+ let text_try = verse_element_text(span);
|
|
|
+ println!("trying: {:?}", text_try);
|
|
|
+
|
|
|
+ // There can be multiples of these with matching values.
|
|
|
+ let lines: String = span
|
|
|
+ .select(&span_class_selector)
|
|
|
+ .filter(|x| {
|
|
|
+ if let Some(c) = x.attr("class") {
|
|
|
+ if c.contains("content") {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ false
|
|
|
+ })
|
|
|
+ .map(|x| {
|
|
|
+ println!("x = {:?}", element_text(x));
|
|
|
+ let init = String::new();
|
|
|
+ let j = x.text().fold(init, |acc, x| {
|
|
|
+ let mut s = acc;
|
|
|
+ if x == " " {
|
|
|
+ s.push_str("\n");
|
|
|
+ } else {
|
|
|
+ s.push_str(x);
|
|
|
+ }
|
|
|
+ s
|
|
|
+ });
|
|
|
+ j
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ println!("data {} lines {:?}", data, lines);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(result)
|
|
|
+}
|
|
|
+
|
|
|
pub fn find_next_chapter(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(html);
|
|
|
// let a_selector = scraper::Selector::parse("div>a").unwrap();
|