Browse Source

Working WalkerParser.

Merges content (verses), but only if they can be merged.
(It won't merge quote:true and quote:false.)
Steve Thielemann 1 month ago
parent
commit
6f31a912ee
3 changed files with 308 additions and 164 deletions
  1. 18 40
      src/config.rs
  2. 12 10
      src/main.rs
  3. 278 114
      src/parse.rs

+ 18 - 40
src/config.rs

@@ -32,6 +32,7 @@ pub struct BasicJSON {
 }
 
 impl BasicJSON {
+    #[allow(dead_code)]    
     pub fn new() -> Self {
         Self {
             books: Vec::new(),
@@ -41,7 +42,7 @@ impl BasicJSON {
 
     /// Add new book, return mutable instance of it.
     pub fn new_book(&mut self, name: &String) -> &mut BasicChaptersJSON {
-        if ! self.book.contains_key(name) {
+        if !self.book.contains_key(name) {
             self.book.insert(name.clone(), BasicChaptersJSON::new());
         }
         self.book.get_mut(name).unwrap()
@@ -60,7 +61,7 @@ pub struct BasicChaptersJSON {
 impl BasicChaptersJSON {
     pub fn new() -> Self {
         Self {
-            chapters: Vec::new()
+            chapters: Vec::new(),
         }
     }
 
@@ -68,62 +69,39 @@ impl BasicChaptersJSON {
         while self.chapters.len() < index {
             self.chapters.push(BasicVersesJSON::new());
         }
-        self.chapters.get_mut(index-1).unwrap()
+        self.chapters.get_mut(index - 1).unwrap()
     }
 }
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct BasicVersesJSON {
-    pub verses: Vec<BasicVerseJSON>,
+    pub verses: Vec<Vec<BasicVerseJSON>>,
 }
 
 impl BasicVersesJSON {
     pub fn new() -> Self {
-        Self {
-            verses: Vec::new()
-        }
+        Self { verses: Vec::new() }
     }
 
-    pub fn new_verse(&mut self, index: usize) -> &mut BasicVerseJSON {
+    pub fn verse(&mut self, index: usize) -> &mut Vec<BasicVerseJSON> {
         while self.verses.len() < index {
-            self.verses.push(BasicVerseJSON::new());
+            self.verses.push(Vec::new());
         }
-        self.verses.get_mut(index-1).unwrap()
+        self.verses.get_mut(index - 1).unwrap()
     }
 }
 
 #[derive(Serialize, Deserialize, Debug)]
-pub struct BasicVerseJSON {
-    pub start_paragraph: bool,
-    pub heading: Option<String>,
-    pub verse: String,
-}
-
-impl BasicVerseJSON {
-    pub fn new() -> Self {
-        Self {
-            start_paragraph: false,
-            heading: None,
-            verse: String::new(),
-        }
-    }
-}
-
-/*
-impl BasicJSON {
-    pub fn new() -> Self {
-        Self { book: HashMap::new()}
-    }
-}
-*/
-
-/*
-#[derive(Serialize, Deserialize)]
-pub struct BasicVerseJSON {
-    pub verse: u8,
-    pub text: String,
+pub enum BasicVerseJSON {
+    Heading(String),
+    Note(String),
+    Verse {
+        text: String,
+        paragraph: bool,
+        quote: bool,
+        red: bool,
+    },
 }
-*/
 
 pub fn save_basic_json(filename: &str, json: &BasicJSON) -> Result<()> {
     let data = serde_json::to_string_pretty(json)?;

+ 12 - 10
src/main.rs

@@ -304,10 +304,10 @@ fn main() -> Result<()> {
 
             let mut last_book = String::new();
             
-            let mut extractor = |file| {
+            let mut extractor = |file| -> Result<()> {
                 println!("File: {}", file);
                 let bv =
-                    parse::extract_verses(filepath.join(file).to_str().unwrap()).unwrap();
+                    parse::extract_verses(filepath.join(file).to_str().unwrap())?;
 
                 println!("Book {} Chapter {} BV: {:?}", bv.0, bv.1, bv.2);
                 if bv.0 != last_book {
@@ -315,14 +315,15 @@ fn main() -> Result<()> {
                     json_output.add_to_books(&bv.0);
                 }
 
-                let mut json_book = json_output.new_book(&bv.0);
-                let mut chapter = json_book.new_chapter(bv.1 as usize);
+                let json_book = json_output.new_book(&bv.0);
+                let chapter = json_book.new_chapter(bv.1 as usize);
 
                 for (idx, bv_item) in bv.2.verses.into_iter().enumerate() {
-                    let mut verse = chapter.new_verse(idx+1);
-                    verse.heading = bv_item.heading;
-                    verse.start_paragraph = bv_item.start_paragraph;
-                    verse.verse = bv_item.verse;
+                    let verse = chapter.verse(idx+1);
+                    for bvi in bv_item {
+                        verse.push(bvi);
+                    }
+                    // verse = bv_item.clone();
                 }
 
                 /* 
@@ -360,18 +361,19 @@ fn main() -> Result<()> {
                     }
                 }
                 */
+                Ok(())
             };
 
             if let Some(count) = *count {
                 // Ok, they gave us a value.  Use it.
                 println!("Extract {}:", count);
                 for file in files.iter().take(count as usize) {
-                    extractor(file);
+                    extractor(file)?;
                 }
             } else {
                 println!("Extract All:");
                 for file in files.iter() {
-                    extractor(file);
+                    extractor(file)?;
                 }
             }
 

+ 278 - 114
src/parse.rs

@@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result};
 use regex::Regex;
 use scraper;
 use scraper::Element;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::string::String;
 
 pub fn find_versions(html: &String) -> Result<String> {
@@ -50,10 +50,12 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
     text
 }
 
+
 /// Extract element verse text
 ///
 /// This trims the elements, (translating "  " to "\n").
 /// Joins with a single space.
+#[allow(dead_code)]
 fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
     let span_class = scraper::Selector::parse("span[class]").unwrap();
     let text: String = element
@@ -158,6 +160,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
     Ok(result)
 }
 
+/* 
 #[derive(Debug)]
 pub struct BasicVerse {
     pub book: String,
@@ -166,6 +169,7 @@ pub struct BasicVerse {
     // pub chapter_verse: String,
     pub text: String,
 }
+*/
 
 fn parse_html_file(filename: &str) -> Result<scraper::Html> {
     let buffer =
@@ -177,18 +181,22 @@ fn parse_html_file(filename: &str) -> Result<scraper::Html> {
 // If I could build a structure of the chapter, maybe I could parse it?
 // I would at least know what to look for...
 
-fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
+/// Display the structure of the HTML
+///
+/// This shows a properly indented layout of the HTML tags.
+/// It shows what is nested in what, and what attributes the element
+/// has.  (And it doesn't delete empty tags like html tidy does.)
+pub fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
     // For output formatting.
     let spacer = " ".repeat(depth as usize * 4);
 
     // This can be multiple classes, so watch out here.
-    let cls = element.attr("class").unwrap();
+    // let cls = element.attr("class").unwrap();
     println!(
-        "{} {} E {} {} {:?}",
+        "{} {} E {} {:?}",
         depth,
         spacer,
         element.value().name(),
-        cls,
         element.value()
     );
 
@@ -201,9 +209,11 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
     }
 }
 
+/// Verse information
 #[derive(Debug)]
 pub enum VerseInformation {
     Heading(String),
+    /// Chapter and Verse "3.16"
     ChapterVerse(String),
     Content {
         text: String,
@@ -211,9 +221,11 @@ pub enum VerseInformation {
         paragraph: bool,
         red: bool,
     },
+    /// Verse note
     Note(String),
 }
 
+/*
 /// Clean element class, and return in a set.
 ///
 /// Classes that have __ in them are returned without the __ and ...
@@ -408,6 +420,206 @@ fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInf
     walker(element, &mut result, &mut classes, &mut track, 0);
     result
 }
+*/
+
+pub struct WalkerParser {
+    results: Vec<VerseInformation>,
+    classes: HashSet<String>,
+    paragraph: bool,
+    chapter_verse: String,
+}
+
+impl WalkerParser {
+    pub fn new() -> Self {
+        Self {
+            results: Vec::<VerseInformation>::new(),
+            classes: HashSet::<String>::new(),
+            paragraph: false,
+            chapter_verse: String::new(),
+        }
+    }
+
+    /// Reset the parser's internal state.
+    pub fn clear(&mut self) {
+        self.results.clear();
+        self.classes.clear();
+        self.paragraph = false;
+        self.chapter_verse.clear();
+    }
+
+    /// Extract element text, trimmed of whitespace.
+    fn element_text(element: scraper::ElementRef<'_>) -> String {
+        let text = element
+            .text()
+            .map(|s| s.trim_matches(char::is_whitespace))
+            .filter(|x| !x.is_empty())
+            .collect::<String>();
+        text
+    }
+
+    /// Clean element class, and return in a set.
+    ///
+    /// Classes that have __ in them are returned without the __ and ...
+    fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String> {
+        let mut result = HashSet::<String>::new();
+        if let Some(e_class) = element.attr("class") {
+            for c in e_class.split(" ") {
+                if let Some(chapter) = c.split_once("__") {
+                    result.insert(chapter.0.to_string());
+                } else {
+                    result.insert(c.to_string());
+                }
+            }
+        }
+        result
+    }
+
+    /// Add note
+    ///
+    /// This will append to a previous note, if the last item in result
+    /// is a VerseInformation::Note.
+    fn add_note(&mut self, note: &str) {
+        if let Some(last) = self.results.last_mut() {
+            if let VerseInformation::Note(n) = last {
+                n.push_str(" ");
+                n.push_str(note);
+                return;
+            }
+        }
+        self.results.push(VerseInformation::Note(note.to_string()));
+    }
+
+    fn add_content(&mut self, c: VerseInformation) {
+        if let VerseInformation::Content{text: ref c_text, quoted: c_q, paragraph: c_p, red: c_r} = c {
+            // I have the Content in a more usable form.
+            let mut insert = false;
+            if let Some(last) = self.results.last_mut() {
+                if let VerseInformation::Content { text: l_text, quoted: l_q, paragraph: l_p, red: l_r } = last {
+                    if *l_q != c_q || *l_r != c_r {
+                        insert = true;
+                    }       
+                    if c_p {
+                        insert = true;
+                    }
+                    // Tests are done.
+                    if !insert {
+                        l_text.push_str(" ");
+                        l_text.push_str(&c_text);
+                        return;
+                    }
+                }
+            }
+            self.results.push(c);
+        } else {
+            panic!("Expected VerseInformation::Content not {:?}", c);
+        }
+    }
+
+    /// Recursively called to handle child elements.
+    ///
+    /// self.classes contains the parent's classes.
+    /// class_hash contains the current element's classes.
+    fn recursive_walker(&mut self, element: scraper::element_ref::ElementRef<'_>) {
+        let class_hash = Self::clean_class(element);
+        if self.classes.contains("ChapterContent_note") {
+            // We're in the note.
+
+            if class_hash.contains("ChapterContent_body") {
+                // Note body.
+                let mut has_children = false;
+                for child in element.child_elements() {
+                    has_children = true;
+                    if let Some(cl) = child.attr("class") {
+                        if cl.contains("_label__") || cl.contains("_fr__") {
+                            continue;
+                        }
+                    }
+                    let text = Self::element_text(child);
+                    if !text.is_empty() {
+                        self.add_note(&Self::element_text(child));
+                    }
+                }
+
+                if !has_children {
+                    let text = Self::element_text(element);
+                    if !text.is_empty() {
+                        self.add_note(&text);
+                    }
+                }
+                // Since we've handled children elements here, we're done here.
+                return;
+            }
+        }
+
+        if class_hash.contains("ChapterContent_verse") {
+            if let Some(ch_v) = element.attr("data-usfm") {
+                if self.chapter_verse != ch_v {
+                    self.chapter_verse = ch_v.to_string();
+                    self.results
+                        .push(VerseInformation::ChapterVerse(ch_v.to_string()));
+                }
+            }
+        }
+
+        if class_hash.contains("ChapterContent_content") {
+            // Content.
+            let quoted = self.classes.contains("ChapterContent_q1")
+                || self.classes.contains("ChapterContent_q2");
+            let red = self.classes.contains("ChapterContent_wj")
+                || self.classes.contains("ChapterContent_wordsofchrist");
+            let text = Self::element_text(element);
+            if !text.is_empty() {
+                let paragraph = self.paragraph;
+                if paragraph {
+                    self.paragraph = false;
+                }
+
+                self.add_content(
+                VerseInformation::Content {
+                    text,
+                    quoted,
+                    paragraph,
+                    red,
+                });
+            }
+        }
+
+        if class_hash.contains("ChapterContent_heading") {
+            let text = Self::element_text(element);
+            if !text.is_empty() {
+                self.results.push(VerseInformation::Heading(text));
+            }
+        }
+
+        if class_hash.contains("ChapterContent_p") {
+            self.paragraph = true;
+        }
+
+        // Unfortunately, has_children always returns true?
+
+        if element.has_children() {
+            // Add element classes to class tracker.
+            for element_class in class_hash.iter() {
+                self.classes.insert(element_class.clone());
+            }
+
+            for child in element.child_elements() {
+                self.recursive_walker(child);
+            }
+
+            for element_class in class_hash.iter() {
+                self.classes.remove(element_class);
+            }
+        }
+    }
+
+    /// Parse the element (and children) into VerseInformation.
+    pub fn parse(&mut self, element: scraper::element_ref::ElementRef<'_>) -> &[VerseInformation] {
+        self.clear();
+        self.recursive_walker(element);
+        self.results.as_slice()
+    }
+}
 
 /// Extract just the Chapter's verses.
 ///
@@ -431,6 +643,8 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
     }
 
     let mut chapter_number: u8 = 0;
+    let mut verse_number: u8 = 0;
+    let mut walker = WalkerParser::new();
 
     // Locate the div that contains all of the chapter verses
     let chapter_selector = scraper::Selector::parse(r#"div[class*="_chapter__"]"#).unwrap();
@@ -439,9 +653,66 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
 
         // This works amazingly well for showing how the html is structured.
         show_structure(chapter, 0);
+        let results = walker.parse(chapter);
+
+        println!("Elements: {:?}", results);
 
-        println!("Elements: {:?}", element_walker(chapter));
+        let mut heading = String::new();
 
+        for r in results {
+            match r {
+                VerseInformation::Heading(h) => {
+                    heading = h.clone();
+                }
+                VerseInformation::ChapterVerse(cv) => {
+                    let parts = cv.split(".").collect::<Vec<_>>();
+                    chapter_number = parts[1].parse().unwrap();
+                    verse_number = parts[2].parse().unwrap();
+
+                    if !heading.is_empty() {
+                        let v = result.verse(verse_number as usize);
+                        v.push(config::BasicVerseJSON::Heading(heading.clone()));
+                        heading.clear();
+                    }
+                    /*
+                    if result.verses.len() < verse_number as usize {
+                        bail!(
+                            "Len = {}, wanting {}",
+                            result.verses.len() + 1,
+                            verse_number
+                        );
+                    }
+                    */
+                }
+                VerseInformation::Note(n) => {
+                    if verse_number == 0 {
+                        println!("DERP! verse_number is zero! Note: {}", n);
+                    } else {
+                    let v = result.verse(verse_number as usize);
+                    v.push(config::BasicVerseJSON::Note(n.clone()));
+                    }
+                }
+                VerseInformation::Content {
+                    text,
+                    quoted,
+                    paragraph,
+                    red,
+                } => {
+                    if verse_number == 0 {
+                        println!("DERP! verse_number is zero! Content: {}!", text);
+                    } else {
+                        let v = result.verse(verse_number as usize);
+                        v.push(config::BasicVerseJSON::Verse {
+                            text: text.to_string(),
+                            paragraph: *paragraph,
+                            quote: *quoted,
+                            red: *red,
+                        });
+                    }
+                }
+            }
+        }
+        /*
         println!("Chapter: {}", chapter.html());
 
         // Look for _s1__ and _p__
@@ -680,6 +951,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
                 }
             }
         }
+        */
     } else {
         bail!("Unable to locate the div tag with _chapter__.");
     }
@@ -687,114 +959,6 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
     Ok((book, chapter_number, result))
 }
 
-#[deprecated(note = "This is the old version, use extract_verses")]
-pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
-    let mut result = Vec::<BasicVerse>::new();
-
-    let buffer = std::fs::read_to_string(filename)?;
-    let document = scraper::Html::parse_document(&buffer);
-    let h1_selector = scraper::Selector::parse("h1").unwrap();
-    let h1 = document.select(&h1_selector).next().unwrap();
-    let mut book = element_text(h1);
-    // println!("Heading: {}", element_text(h1));
-    let mut book_trim = true;
-
-    let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
-    let _span_class_selector = scraper::Selector::parse("span[class]").unwrap();
-    let _span_class_content_selector =
-        scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
-
-    for span in document.select(&span_data_usfm_selector) {
-        // This will always be successful.
-        if let Some(data) = span.attr("data-usfm") {
-            // "GEN.1.2"
-            // let ch_ver = data.split(".").skip(1).collect::<String>();
-
-            let parts = data.split(".").skip(1).collect::<Vec<_>>();
-            let mut chapter_number: u8 = 0;
-            if parts.len() == 2 {
-                chapter_number = parts[0].parse()?;
-            }
-
-            if book_trim {
-                // Only trim the book once.
-                book_trim = false;
-                if chapter_number != 0 {
-                    // Remove chapter number from book.
-                    while book.pop() != Some(' ') {
-                        // Check for a problem.
-                        if book.is_empty() {
-                            bail!(format!(
-                                "Failed to trim the chapter from [{}].",
-                                element_text(h1)
-                            ));
-                        }
-                    }
-                }
-            }
-
-            let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
-
-            // GEN, 1, 2
-            // But, there's some books that don't have chapters.  Beware!
-            let text_try = verse_element_text(span);
-            // This looks good.  ;)
-            // println!("{} text: {:?}", data, text_try);
-
-            if let Some(b) = result.get_mut(verse_number as usize - 1) {
-                // Yes, it already exists...
-                // It seems like these should be joined with "\n" instead of " ".
-
-                if !(*b).text.ends_with("\n") && !text_try.starts_with("\n") {
-                    (*b).text.push_str("\n");
-                }
-                b.text.push_str(text_try.as_str());
-            } else {
-                let bv = BasicVerse {
-                    book: book.clone(),
-                    chapter: chapter_number,
-                    verse: verse_number,
-                    text: text_try,
-                };
-
-                result.push(bv);
-            }
-
-            /*
-            // There can be multiples of these with matching values.
-            let lines: String = span
-                .select(&span_class_selector)
-                .filter(|x| {
-                    if let Some(c) = x.attr("class") {
-                        if c.contains("content") {
-                            return true;
-                        }
-                    }
-                    false
-                })
-                .map(|x| {
-                    println!("x = {:?}", element_text(x));
-                    let init = String::new();
-                    let j = x.text().fold(init, |acc, x| {
-                        let mut s = acc;
-                        if x == "  " {
-                            s.push_str("\n");
-                        } else {
-                            s.push_str(x);
-                        }
-                        s
-                    });
-                    j
-                })
-                .collect();
-
-            println!("data {} lines {:?}", data, lines);
-            */
-        }
-    }
-    Ok(result)
-}
-
 pub fn find_next_chapter(html: &String) -> Result<String> {
     let document = scraper::Html::parse_document(html);
     // let a_selector = scraper::Selector::parse("div>a").unwrap();