Browse Source

Working on verse parsing.

Steve Thielemann 1 month ago
parent
commit
7eab69e67f
2 changed files with 100 additions and 4 deletions
  1. 3 1
      src/main.rs
  2. 97 3
      src/parse.rs

+ 3 - 1
src/main.rs

@@ -223,6 +223,8 @@ fn main() -> Result<()> {
 
             let mut extractor = |file| {
                 println!("File: {}", file);
+                parse::extract_basic_verses(filepath.join(file).to_str().unwrap());
+                println!("----->");
                 /*
                 let mut filepath = cli.work.clone();
                 filepath = filepath.join(file);
@@ -252,7 +254,7 @@ fn main() -> Result<()> {
                 for span in document.select(&span_data_usfm) {
                     // This will always be successful.
                     if let Some(data) = span.attr("data-usfm") {
-                        // There can be multples of these with matching values.
+                        // There can be multiples of these with matching values.
                         println!("data-usfm {}:", data);
 
                         let lines: String = span

+ 97 - 3
src/parse.rs

@@ -1,7 +1,8 @@
 use anyhow::{bail, Result};
+use regex::Regex;
 use scraper;
 use scraper::Element;
-use regex::Regex;
+use std::string::String;
 
 pub fn find_versions(html: &String) -> Result<String> {
     let document = scraper::Html::parse_document(&html);
@@ -11,7 +12,10 @@ pub fn find_versions(html: &String) -> Result<String> {
         let text = element_text(a);
 
         if version_match.is_match(&text) {
-            return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
+            return Ok(format!(
+                "Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}",
+                text, text
+            ));
         }
     }
     bail!("Could not locate a version string.");
@@ -34,6 +38,7 @@ pub struct VerseOfDay {
     pub reference: String,
 }
 
+/// Extract element text, trimmed of whitespace.
 fn element_text(element: scraper::ElementRef<'_>) -> String {
     let text = element
         .text()
@@ -43,6 +48,33 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
     text
 }
 
+/// Collect the verse text held inside `element`.
+///
+/// Every descendant `span` that has a `class` attribute containing
+/// "content" contributes one piece: a span whose text is exactly two
+/// spaces becomes "\n", any other span's text is trimmed of whitespace.
+/// The pieces are then joined with a single space.
+fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
+    let classed_spans = scraper::Selector::parse("span[class]").unwrap();
+    let mut pieces: Vec<String> = Vec::new();
+
+    for node in element.select(&classed_spans) {
+        // Only spans whose class mentions "content" carry verse text.
+        let is_content = match node.attr("class") {
+            Some(class) => class.contains("content"),
+            None => false,
+        };
+        if !is_content {
+            continue;
+        }
+
+        let raw: String = node.text().collect();
+        let piece = if raw == "  " {
+            String::from("\n")
+        } else {
+            raw.trim().to_string()
+        };
+        pieces.push(piece);
+    }
+
+    pieces.join(" ")
+}
+
 pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     let document = scraper::Html::parse_document(&html);
     // let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap();
@@ -96,7 +128,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
         }
     }
 
-     // Previous ones are in div[class="mlb-2"]
+    // Previous ones are in div[class="mlb-2"]
 
     let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
     let a_selector1 =
@@ -124,6 +156,68 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     Ok(result)
 }
 
+/// A single verse extracted from a chapter page.
+///
+/// NOTE(review): nothing visible in this change constructs this type yet,
+/// so the field descriptions are assumptions based on the field names and
+/// the "GEN.1.2" style `data-usfm` values — confirm once it is populated.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct BasicVerse {
+    /// Presumably the book identifier (e.g. the "GEN" of "GEN.1.2").
+    pub book: String,
+    /// Presumably the chapter/verse part of the reference.
+    pub chapter_verse: String,
+    /// Presumably the text of the verse itself.
+    pub verse: String,
+}
+
+/// Parse the HTML file at `filename` and print the verse text found in it.
+///
+/// The first `<h1>` is printed as the heading. Then every `span[data-usfm]`
+/// element is visited and the text of its "content" spans is printed twice:
+/// once via `verse_element_text` and once as the raw concatenated text.
+///
+/// NOTE(review): the returned vector is never filled in yet, so a
+/// successful call always yields an empty `Vec`.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read or if the document has no
+/// `<h1>` element.
+pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
+    let result = Vec::<BasicVerse>::new();
+    let buffer = std::fs::read_to_string(filename)?;
+    let document = scraper::Html::parse_document(&buffer);
+    let h1_selector = scraper::Selector::parse("h1").unwrap();
+    // A page without a heading is reported as an error instead of panicking.
+    let h1 = match document.select(&h1_selector).next() {
+        Some(h1) => h1,
+        None => bail!("No <h1> heading found in {}", filename),
+    };
+    println!("Heading: {}", element_text(h1));
+
+    let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
+    let span_class_selector = scraper::Selector::parse("span[class]").unwrap();
+
+    for span in document.select(&span_data_usfm_selector) {
+        // The selector requires the attribute, so this always matches.
+        if let Some(data) = span.attr("data-usfm") {
+            // `data` looks like "GEN.1.2", i.e. GEN, 1, 2.
+            // But there are some books that don't have chapters.  Beware!
+            let text_try = verse_element_text(span);
+            println!("trying: {:?}", text_try);
+
+            // There can be multiples of these with matching values.
+            let lines: String = span
+                .select(&span_class_selector)
+                .filter(|x| x.attr("class").map_or(false, |c| c.contains("content")))
+                .map(|x| {
+                    println!("x = {:?}", element_text(x));
+                    // A text node of exactly two spaces is translated to a newline;
+                    // everything else is kept untouched (no trimming here).
+                    x.text()
+                        .map(|t| if t == "  " { "\n" } else { t })
+                        .collect::<String>()
+                })
+                .collect();
+
+            println!("data {} lines {:?}", data, lines);
+        }
+    }
+    Ok(result)
+}
+
 pub fn find_next_chapter(html: &String) -> Result<String> {
     let document = scraper::Html::parse_document(html);
     // let a_selector = scraper::Selector::parse("div>a").unwrap();