|
@@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result};
|
|
|
use regex::Regex;
|
|
|
use scraper;
|
|
|
use scraper::Element;
|
|
|
-use std::collections::HashSet;
|
|
|
+use std::collections::{HashMap, HashSet};
|
|
|
use std::string::String;
|
|
|
|
|
|
pub fn find_versions(html: &String) -> Result<String> {
|
|
@@ -203,8 +203,14 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
|
|
|
#[derive(Debug)]
|
|
|
pub enum VerseInformation {
|
|
|
+ Heading(String),
|
|
|
ChapterVerse(String),
|
|
|
- Content(String),
|
|
|
+ Content {
|
|
|
+ text: String,
|
|
|
+ quoted: bool,
|
|
|
+ paragraph: bool,
|
|
|
+ red: bool,
|
|
|
+ },
|
|
|
Note(String),
|
|
|
}
|
|
|
|
|
@@ -227,11 +233,12 @@ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String>
|
|
|
|
|
|
// This doesn't work because ft is a child of body.
|
|
|
fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
|
|
|
- let body_selector = scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
+ let body_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
let mut text = String::new();
|
|
|
|
|
|
if let Some(body) = element.select(&body_selector).next() {
|
|
|
- // for body in element.select(&body_selector).next() {
|
|
|
+ // for body in element.select(&body_selector).next() {
|
|
|
if !text.is_empty() {
|
|
|
text.push_str(" ");
|
|
|
}
|
|
@@ -242,7 +249,7 @@ fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<Ve
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
|
|
|
+fn add_append_note(results: &mut Vec<VerseInformation>, note: &str) {
|
|
|
if let Some(last) = results.last_mut() {
|
|
|
if let VerseInformation::Note(n) = last {
|
|
|
// Ok, the last thing is a "Note".
|
|
@@ -252,12 +259,13 @@ fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
|
|
|
results.push(VerseInformation::Note(note.to_string()));
|
|
|
}
|
|
|
}
|
|
|
-}
|
|
|
+}
|
|
|
|
|
|
fn walker(
|
|
|
element: scraper::element_ref::ElementRef<'_>,
|
|
|
results: &mut Vec<VerseInformation>,
|
|
|
classes: &mut HashSet<String>,
|
|
|
+ track: &mut HashMap<String, String>,
|
|
|
depth: u32,
|
|
|
) {
|
|
|
// For output formatting.
|
|
@@ -293,7 +301,6 @@ fn walker(
|
|
|
// results.push(VerseInformation::Note(text));
|
|
|
}
|
|
|
// No children, we can return.
|
|
|
-
|
|
|
}
|
|
|
return;
|
|
|
}
|
|
@@ -302,14 +309,32 @@ fn walker(
|
|
|
// let mut ch_verse = String::new();
|
|
|
if class_hash.contains("ChapterContent_verse") {
|
|
|
if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
+ // Duplicate ChapterVerse items were showing up in the results, so remember the last ch_v in `track`.
|
|
|
// Check the last item.
|
|
|
+ let mut new_chv = false;
|
|
|
+
|
|
|
+ if track.contains_key("ch_v") {
|
|
|
+ if let Some(tchv) = track.get("ch_v") {
|
|
|
+ if tchv != ch_v {
|
|
|
+ new_chv = true;
|
|
|
+ track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ new_chv = true;
|
|
|
+ track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
+ }
|
|
|
+
|
|
|
if let Some(last) = results.last() {
|
|
|
if let VerseInformation::ChapterVerse(_) = last {
|
|
|
// Ok, the last thing is a "ChapterVerse". Remove it.
|
|
|
results.pop();
|
|
|
}
|
|
|
}
|
|
|
- results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+
|
|
|
+ if new_chv {
|
|
|
+ results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -321,13 +346,41 @@ fn walker(
|
|
|
// Should I check the classes here for:
|
|
|
// _p__ Paragraph?
|
|
|
// _q1__, _q2__ Quote?
|
|
|
+ let quoted = classes.contains("ChapterContent_q1") || classes.contains("ChapterContent_q2");
|
|
|
+ let red = classes.contains("ChapterContent_wj")
|
|
|
+ || classes.contains("ChapterContent_wordsofchrist");
|
|
|
+ let text = element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+
|
|
|
+ // We have something to save. Is this start of paragraph?
|
|
|
+ let p = track.contains_key("p");
|
|
|
+ if p {
|
|
|
+ // Ok, we're storing it. Reset the paragraph flag.
|
|
|
+ track.remove("p");
|
|
|
+ }
|
|
|
+
|
|
|
+ results.push(VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph: p,
|
|
|
+ red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
+ if class_hash.contains("ChapterContent_heading") {
|
|
|
let text = element_text(element);
|
|
|
if !text.is_empty() {
|
|
|
- results.push(VerseInformation::Content(text));
|
|
|
+ results.push(VerseInformation::Heading(text));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ if class_hash.contains("ChapterContent_p") {
|
|
|
+ track.insert("p".to_string(), "".to_string());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Unfortunately, has_children() is true for nearly every element (presumably because text nodes count as children - TODO confirm).
|
|
|
+
|
|
|
if element.has_children() {
|
|
|
// Add the classes to our class tracker.
|
|
|
for ch in class_hash.iter() {
|
|
@@ -335,7 +388,7 @@ fn walker(
|
|
|
}
|
|
|
|
|
|
for child in element.child_elements() {
|
|
|
- walker(child, results, classes, depth + 1);
|
|
|
+ walker(child, results, classes, track, depth + 1);
|
|
|
}
|
|
|
|
|
|
// Remove the classes from the class tracker.
|
|
@@ -345,10 +398,14 @@ fn walker(
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+// TODO: Rewrite the walker as a struct with impl methods, so that the
|
|
|
+// state lives in the struct instead of being passed through every call.
|
|
|
+
|
|
|
fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
|
|
|
let mut result = Vec::<VerseInformation>::new();
|
|
|
let mut classes = HashSet::<String>::new();
|
|
|
- walker(element, &mut result, &mut classes, 0);
|
|
|
+ let mut track = HashMap::<String, String>::new();
|
|
|
+ walker(element, &mut result, &mut classes, &mut track, 0);
|
|
|
result
|
|
|
}
|
|
|
|
|
@@ -381,7 +438,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
// Ok, this is the chapter section.
|
|
|
|
|
|
// This works amazingly well for showing how the html is structured.
|
|
|
- // show_structure(chapter, 0);
|
|
|
+ show_structure(chapter, 0);
|
|
|
|
|
|
println!("Elements: {:?}", element_walker(chapter));
|
|
|
|