فهرست منبع

Working walker. Change to struct to clean it up.

We're passing around state in function calls...
Steve Thielemann 11 ماه پیش
والد
کامیت
9371e904d8
1 فایلهای تغییر یافته به همراه 69 افزوده شده و 12 حذف شده
  1. 69 12
      src/parse.rs

+ 69 - 12
src/parse.rs

@@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result};
 use regex::Regex;
 use scraper;
 use scraper::Element;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::string::String;
 
 pub fn find_versions(html: &String) -> Result<String> {
@@ -203,8 +203,14 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
 
 #[derive(Debug)]
 pub enum VerseInformation {
+    Heading(String),
     ChapterVerse(String),
-    Content(String),
+    Content {
+        text: String,
+        quoted: bool,
+        paragraph: bool,
+        red: bool,
+    },
     Note(String),
 }
 
@@ -227,11 +233,12 @@ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String>
 
 // This doesn't work because ft is a child of body.
 fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
-    let body_selector = scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
+    let body_selector =
+        scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
     let mut text = String::new();
 
     if let Some(body) = element.select(&body_selector).next() {
-    // for body in element.select(&body_selector).next() {
+        // for body in element.select(&body_selector).next() {
         if !text.is_empty() {
             text.push_str(" ");
         }
@@ -242,7 +249,7 @@ fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<Ve
     }
 }
 
-fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
+fn add_append_note(results: &mut Vec<VerseInformation>, note: &str) {
     if let Some(last) = results.last_mut() {
         if let VerseInformation::Note(n) = last {
             // Ok, the last thing is a "Note".
@@ -252,12 +259,13 @@ fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
             results.push(VerseInformation::Note(note.to_string()));
         }
     }
-} 
+}
 
 fn walker(
     element: scraper::element_ref::ElementRef<'_>,
     results: &mut Vec<VerseInformation>,
     classes: &mut HashSet<String>,
+    track: &mut HashMap<String, String>,
     depth: u32,
 ) {
     // For output formatting.
@@ -293,7 +301,6 @@ fn walker(
                     // results.push(VerseInformation::Note(text));
                 }
                 // No children, we can return.
-                
             }
             return;
         }
@@ -302,14 +309,32 @@ fn walker(
     // let mut ch_verse = String::new();
     if class_hash.contains("ChapterContent_verse") {
         if let Some(ch_v) = element.attr("data-usfm") {
+            // I'm getting duplicate ChapterVerse items in the results now.
             // Check the last item.
+            let mut new_chv = false;
+
+            if track.contains_key("ch_v") {
+                if let Some(tchv) = track.get("ch_v") {
+                    if tchv != ch_v {
+                        new_chv = true;
+                        track.insert("ch_v".to_string(), ch_v.to_string());
+                    }
+                }
+            } else {
+                new_chv = true;
+                track.insert("ch_v".to_string(), ch_v.to_string());
+            }
+
             if let Some(last) = results.last() {
                 if let VerseInformation::ChapterVerse(_) = last {
                     // Ok, the last thing is a "ChapterVerse".  Remove it.
                     results.pop();
                 }
             }
-            results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
+
+            if new_chv {
+                results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
+            }
         }
     }
 
@@ -321,13 +346,41 @@ fn walker(
         // Should I check the classes here for:
         // _p__ Paragraph?
         // _q1__, _q2__ Quote?
+        let quoted = classes.contains("ChapterContent_q1") || classes.contains("ChapterContent_q2");
+        let red = classes.contains("ChapterContent_wj")
+            || classes.contains("ChapterContent_wordsofchrist");
+        let text = element_text(element);
+        if !text.is_empty() {
+
+            // We have something to save.  Is this start of paragraph?
+            let p = track.contains_key("p");
+            if p {
+                // Ok, we're storing it.  Reset the paragraph flag.
+                track.remove("p");
+            }
+
+            results.push(VerseInformation::Content {
+                text,
+                quoted,
+                paragraph: p,
+                red,
+            });
+        }
+    }
 
+    if class_hash.contains("ChapterContent_heading") {
         let text = element_text(element);
         if !text.is_empty() {
-            results.push(VerseInformation::Content(text));
+            results.push(VerseInformation::Heading(text));
         }
     }
 
+    if class_hash.contains("ChapterContent_p") {
+        track.insert("p".to_string(), "".to_string());
+    }
+
+    // Unfortunately, has_children always returns true...
+
     if element.has_children() {
         // Add the classes to our class tracker.
         for ch in class_hash.iter() {
@@ -335,7 +388,7 @@ fn walker(
         }
 
         for child in element.child_elements() {
-            walker(child, results, classes, depth + 1);
+            walker(child, results, classes, track, depth + 1);
         }
 
         // Remove the classes from the class tracker.
@@ -345,10 +398,14 @@ fn walker(
     }
 }
 
+// TO FIX:  Write this as a structure with impl method calls.
+// Eliminate the passing of state via the function calls.
+
 fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
     let mut result = Vec::<VerseInformation>::new();
     let mut classes = HashSet::<String>::new();
-    walker(element, &mut result, &mut classes, 0);
+    let mut track = HashMap::<String, String>::new();
+    walker(element, &mut result, &mut classes, &mut track, 0);
     result
 }
 
@@ -381,7 +438,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
         // Ok, this is the chapter section.
 
         // This works amazingly well for showing how the html is structured.
-        // show_structure(chapter, 0);
+        show_structure(chapter, 0);
 
         println!("Elements: {:?}", element_walker(chapter));