Explorar el Código

Working walker. Change to struct to clean it up.

We're passing around state in function calls...
Steve Thielemann hace 1 mes
padre
commit
9371e904d8
Se han modificado 1 ficheros con 69 adiciones y 12 borrados
  1. 69 12
      src/parse.rs

+ 69 - 12
src/parse.rs

@@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result};
 use regex::Regex;
 use scraper;
 use scraper::Element;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::string::String;
 
 pub fn find_versions(html: &String) -> Result<String> {
@@ -203,8 +203,14 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
 
 #[derive(Debug)]
 pub enum VerseInformation {
+    Heading(String),
     ChapterVerse(String),
-    Content(String),
+    Content {
+        text: String,
+        quoted: bool,
+        paragraph: bool,
+        red: bool,
+    },
     Note(String),
 }
 
@@ -227,11 +233,12 @@ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String>
 
 // This doesn't work because ft is a child of body.
 fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
-    let body_selector = scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
+    let body_selector =
+        scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
     let mut text = String::new();
 
     if let Some(body) = element.select(&body_selector).next() {
-    // for body in element.select(&body_selector).next() {
+        // for body in element.select(&body_selector).next() {
         if !text.is_empty() {
             text.push_str(" ");
         }
@@ -242,7 +249,7 @@ fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<Ve
     }
 }
 
-fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
+fn add_append_note(results: &mut Vec<VerseInformation>, note: &str) {
     if let Some(last) = results.last_mut() {
         if let VerseInformation::Note(n) = last {
             // Ok, the last thing is a "Note".
@@ -252,12 +259,13 @@ fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
             results.push(VerseInformation::Note(note.to_string()));
         }
     }
-} 
+}
 
 fn walker(
     element: scraper::element_ref::ElementRef<'_>,
     results: &mut Vec<VerseInformation>,
     classes: &mut HashSet<String>,
+    track: &mut HashMap<String, String>,
     depth: u32,
 ) {
     // For output formatting.
@@ -293,7 +301,6 @@ fn walker(
                     // results.push(VerseInformation::Note(text));
                 }
                 // No children, we can return.
-                
             }
             return;
         }
@@ -302,14 +309,32 @@ fn walker(
     // let mut ch_verse = String::new();
     if class_hash.contains("ChapterContent_verse") {
         if let Some(ch_v) = element.attr("data-usfm") {
+            // I'm getting duplicate ChapterVerse items in the results now.
             // Check the last item.
+            let mut new_chv = false;
+
+            if track.contains_key("ch_v") {
+                if let Some(tchv) = track.get("ch_v") {
+                    if tchv != ch_v {
+                        new_chv = true;
+                        track.insert("ch_v".to_string(), ch_v.to_string());
+                    }
+                }
+            } else {
+                new_chv = true;
+                track.insert("ch_v".to_string(), ch_v.to_string());
+            }
+
             if let Some(last) = results.last() {
                 if let VerseInformation::ChapterVerse(_) = last {
                     // Ok, the last thing is a "ChapterVerse".  Remove it.
                     results.pop();
                 }
             }
-            results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
+
+            if new_chv {
+                results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
+            }
         }
     }
 
@@ -321,13 +346,41 @@ fn walker(
         // Should I check the classes here for:
         // _p__ Paragraph?
         // _q1__, _q2__ Quote?
+        let quoted = classes.contains("ChapterContent_q1") || classes.contains("ChapterContent_q2");
+        let red = classes.contains("ChapterContent_wj")
+            || classes.contains("ChapterContent_wordsofchrist");
+        let text = element_text(element);
+        if !text.is_empty() {
+
+            // We have something to save.  Is this the start of a paragraph?
+            let p = track.contains_key("p");
+            if p {
+                // Ok, we're storing it.  Reset the paragraph flag.
+                track.remove("p");
+            }
+
+            results.push(VerseInformation::Content {
+                text,
+                quoted,
+                paragraph: p,
+                red,
+            });
+        }
+    }
 
+    if class_hash.contains("ChapterContent_heading") {
         let text = element_text(element);
         if !text.is_empty() {
-            results.push(VerseInformation::Content(text));
+            results.push(VerseInformation::Heading(text));
         }
     }
 
+    if class_hash.contains("ChapterContent_p") {
+        track.insert("p".to_string(), "".to_string());
+    }
+
+    // Unfortunately, has_children always returns true...
+
     if element.has_children() {
         // Add the classes to our class tracker.
         for ch in class_hash.iter() {
@@ -335,7 +388,7 @@ fn walker(
         }
 
         for child in element.child_elements() {
-            walker(child, results, classes, depth + 1);
+            walker(child, results, classes, track, depth + 1);
         }
 
         // Remove the classes from the class tracker.
@@ -345,10 +398,14 @@ fn walker(
     }
 }
 
+// TO FIX:  Write this as a structure with impl method calls.
+// Eliminate the passing of state via the function calls.
+
 fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
     let mut result = Vec::<VerseInformation>::new();
     let mut classes = HashSet::<String>::new();
-    walker(element, &mut result, &mut classes, 0);
+    let mut track = HashMap::<String, String>::new();
+    walker(element, &mut result, &mut classes, &mut track, 0);
     result
 }
 
@@ -381,7 +438,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
         // Ok, this is the chapter section.
 
         // This works amazingly well for showing how the html is structured.
-        // show_structure(chapter, 0);
+        show_structure(chapter, 0);
 
         println!("Elements: {:?}", element_walker(chapter));