|
@@ -50,10 +50,12 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
text
|
|
|
}
|
|
|
|
|
|
+
|
|
|
/// Extract element verse text
|
|
|
///
|
|
|
/// This trims the elements, (translating " " to "\n").
|
|
|
/// Joins with a single space.
|
|
|
+#[allow(dead_code)]
|
|
|
fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
let span_class = scraper::Selector::parse("span[class]").unwrap();
|
|
|
let text: String = element
|
|
@@ -158,6 +160,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
Ok(result)
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
#[derive(Debug)]
|
|
|
pub struct BasicVerse {
|
|
|
pub book: String,
|
|
@@ -166,6 +169,7 @@ pub struct BasicVerse {
|
|
|
// pub chapter_verse: String,
|
|
|
pub text: String,
|
|
|
}
|
|
|
+*/
|
|
|
|
|
|
fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
let buffer =
|
|
@@ -177,18 +181,22 @@ fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
// If I could build a structure of the chapter, maybe I could parse it?
|
|
|
// I would at least know what to look for...
|
|
|
|
|
|
-fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
+/// Display the structure of the HTML
|
|
|
+///
|
|
|
+/// This shows a properly indented layout of the HTML tags.
|
|
|
+/// It shows what is nested in what, and what attributes the element
|
|
|
+/// has. (And it doesn't delete empty tags like html tidy does.)
|
|
|
+pub fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
// For output formatting.
|
|
|
let spacer = " ".repeat(depth as usize * 4);
|
|
|
|
|
|
// This can be multiple classes, so watch out here.
|
|
|
- let cls = element.attr("class").unwrap();
|
|
|
+ // let cls = element.attr("class").unwrap();
|
|
|
println!(
|
|
|
- "{} {} E {} {} {:?}",
|
|
|
+ "{} {} E {} {:?}",
|
|
|
depth,
|
|
|
spacer,
|
|
|
element.value().name(),
|
|
|
- cls,
|
|
|
element.value()
|
|
|
);
|
|
|
|
|
@@ -201,13 +209,23 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Verse information
|
|
|
#[derive(Debug)]
|
|
|
pub enum VerseInformation {
|
|
|
+ Heading(String),
|
|
|
+ /// Chapter and Verse "3.16"
|
|
|
ChapterVerse(String),
|
|
|
- Content(String),
|
|
|
+ Content {
|
|
|
+ text: String,
|
|
|
+ quoted: bool,
|
|
|
+ paragraph: bool,
|
|
|
+ red: bool,
|
|
|
+ },
|
|
|
+ /// Verse note
|
|
|
Note(String),
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
/// Clean element class, and return in a set.
|
|
|
///
|
|
|
/// Classes that have __ in them are returned without the __ and ...
|
|
@@ -227,11 +245,12 @@ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String>
|
|
|
|
|
|
// This doesn't work because ft is a child of body.
|
|
|
fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
|
|
|
- let body_selector = scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
+ let body_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
let mut text = String::new();
|
|
|
|
|
|
if let Some(body) = element.select(&body_selector).next() {
|
|
|
- // for body in element.select(&body_selector).next() {
|
|
|
+ // for body in element.select(&body_selector).next() {
|
|
|
if !text.is_empty() {
|
|
|
text.push_str(" ");
|
|
|
}
|
|
@@ -242,7 +261,7 @@ fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<Ve
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
|
|
|
+fn add_append_note(results: &mut Vec<VerseInformation>, note: &str) {
|
|
|
if let Some(last) = results.last_mut() {
|
|
|
if let VerseInformation::Note(n) = last {
|
|
|
// Ok, the last thing is a "ChapterVerse".
|
|
@@ -252,12 +271,13 @@ fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
|
|
|
results.push(VerseInformation::Note(note.to_string()));
|
|
|
}
|
|
|
}
|
|
|
-}
|
|
|
+}
|
|
|
|
|
|
fn walker(
|
|
|
element: scraper::element_ref::ElementRef<'_>,
|
|
|
results: &mut Vec<VerseInformation>,
|
|
|
classes: &mut HashSet<String>,
|
|
|
+ track: &mut HashMap<String, String>,
|
|
|
depth: u32,
|
|
|
) {
|
|
|
// For output formatting.
|
|
@@ -293,7 +313,6 @@ fn walker(
|
|
|
// results.push(VerseInformation::Note(text));
|
|
|
}
|
|
|
// No children, we can return.
|
|
|
-
|
|
|
}
|
|
|
return;
|
|
|
}
|
|
@@ -302,14 +321,32 @@ fn walker(
|
|
|
// let mut ch_verse = String::new();
|
|
|
if class_hash.contains("ChapterContent_verse") {
|
|
|
if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
+ // I'm getting duplicate ChapterVerse items in the results now.
|
|
|
// Check the last item.
|
|
|
+ let mut new_chv = false;
|
|
|
+
|
|
|
+ if track.contains_key("ch_v") {
|
|
|
+ if let Some(tchv) = track.get("ch_v") {
|
|
|
+ if tchv != ch_v {
|
|
|
+ new_chv = true;
|
|
|
+ track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ new_chv = true;
|
|
|
+ track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
+ }
|
|
|
+
|
|
|
if let Some(last) = results.last() {
|
|
|
if let VerseInformation::ChapterVerse(_) = last {
|
|
|
// Ok, the last thing is a "ChapterVerse". Remove it.
|
|
|
results.pop();
|
|
|
}
|
|
|
}
|
|
|
- results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+
|
|
|
+ if new_chv {
|
|
|
+ results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -321,13 +358,41 @@ fn walker(
|
|
|
// Should I check the classes here for:
|
|
|
// _p__ Paragraph?
|
|
|
// _q1__, _q2__ Quote?
|
|
|
+ let quoted = classes.contains("ChapterContent_q1") || classes.contains("ChapterContent_q2");
|
|
|
+ let red = classes.contains("ChapterContent_wj")
|
|
|
+ || classes.contains("ChapterContent_wordsofchrist");
|
|
|
+ let text = element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+
|
|
|
+ // We have something to save. Is this start of paragraph?
|
|
|
+ let p = track.contains_key("p");
|
|
|
+ if p {
|
|
|
+ // Ok, we're storing it. Reset the paragraph flag.
|
|
|
+ track.remove("p");
|
|
|
+ }
|
|
|
+
|
|
|
+ results.push(VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph: p,
|
|
|
+ red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
+ if class_hash.contains("ChapterContent_heading") {
|
|
|
let text = element_text(element);
|
|
|
if !text.is_empty() {
|
|
|
- results.push(VerseInformation::Content(text));
|
|
|
+ results.push(VerseInformation::Heading(text));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ if class_hash.contains("ChapterContent_p") {
|
|
|
+ track.insert("p".to_string(), "".to_string());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Unfortunately, has_children always returns true...
|
|
|
+
|
|
|
if element.has_children() {
|
|
|
// Add the classes to our class tracker.
|
|
|
for ch in class_hash.iter() {
|
|
@@ -335,7 +400,7 @@ fn walker(
|
|
|
}
|
|
|
|
|
|
for child in element.child_elements() {
|
|
|
- walker(child, results, classes, depth + 1);
|
|
|
+ walker(child, results, classes, track, depth + 1);
|
|
|
}
|
|
|
|
|
|
// Remove the classes from the class tracker.
|
|
@@ -345,12 +410,216 @@ fn walker(
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+// TO FIX: Write this as a structure with impl method calls.
|
|
|
+// Eliminate the passing of state via the function calls.
|
|
|
+
|
|
|
fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
|
|
|
let mut result = Vec::<VerseInformation>::new();
|
|
|
let mut classes = HashSet::<String>::new();
|
|
|
- walker(element, &mut result, &mut classes, 0);
|
|
|
+ let mut track = HashMap::<String, String>::new();
|
|
|
+ walker(element, &mut result, &mut classes, &mut track, 0);
|
|
|
result
|
|
|
}
|
|
|
+*/
|
|
|
+
|
|
|
+pub struct WalkerParser {
|
|
|
+ results: Vec<VerseInformation>,
|
|
|
+ classes: HashSet<String>,
|
|
|
+ paragraph: bool,
|
|
|
+ chapter_verse: String,
|
|
|
+}
|
|
|
+
|
|
|
+impl WalkerParser {
|
|
|
+ pub fn new() -> Self {
|
|
|
+ Self {
|
|
|
+ results: Vec::<VerseInformation>::new(),
|
|
|
+ classes: HashSet::<String>::new(),
|
|
|
+ paragraph: false,
|
|
|
+ chapter_verse: String::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Reset the parser's internal state.
|
|
|
+ pub fn clear(&mut self) {
|
|
|
+ self.results.clear();
|
|
|
+ self.classes.clear();
|
|
|
+ self.paragraph = false;
|
|
|
+ self.chapter_verse.clear();
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Extract element text, trimmed of whitespace.
|
|
|
+ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
+ let text = element
|
|
|
+ .text()
|
|
|
+ .map(|s| s.trim_matches(char::is_whitespace))
|
|
|
+ .filter(|x| !x.is_empty())
|
|
|
+ .collect::<String>();
|
|
|
+ text
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Clean element class, and return in a set.
|
|
|
+ ///
|
|
|
+ /// Classes that have __ in them are returned without the __ and ...
|
|
|
+ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String> {
|
|
|
+ let mut result = HashSet::<String>::new();
|
|
|
+ if let Some(e_class) = element.attr("class") {
|
|
|
+ for c in e_class.split(" ") {
|
|
|
+ if let Some(chapter) = c.split_once("__") {
|
|
|
+ result.insert(chapter.0.to_string());
|
|
|
+ } else {
|
|
|
+ result.insert(c.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Add note
|
|
|
+ ///
|
|
|
+ /// This will append to a previous note, if the last item in result
|
|
|
+ /// is a VerseInformation::Note.
|
|
|
+ fn add_note(&mut self, note: &str) {
|
|
|
+ if let Some(last) = self.results.last_mut() {
|
|
|
+ if let VerseInformation::Note(n) = last {
|
|
|
+ n.push_str(" ");
|
|
|
+ n.push_str(note);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ self.results.push(VerseInformation::Note(note.to_string()));
|
|
|
+ }
|
|
|
+
|
|
|
+ fn add_content(&mut self, c: VerseInformation) {
|
|
|
+ if let VerseInformation::Content{text: ref c_text, quoted: c_q, paragraph: c_p, red: c_r} = c {
|
|
|
+ // I have the Content in a more usable form.
|
|
|
+ let mut insert = false;
|
|
|
+ if let Some(last) = self.results.last_mut() {
|
|
|
+ if let VerseInformation::Content { text: l_text, quoted: l_q, paragraph: l_p, red: l_r } = last {
|
|
|
+ if *l_q != c_q || *l_r != c_r {
|
|
|
+ insert = true;
|
|
|
+ }
|
|
|
+ if c_p {
|
|
|
+ insert = true;
|
|
|
+ }
|
|
|
+                // No formatting flag changed and this is not a new paragraph:
|
|
|
+                if !insert {
|
|
|
+ l_text.push_str(" ");
|
|
|
+ l_text.push_str(&c_text);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ self.results.push(c);
|
|
|
+ } else {
|
|
|
+ panic!("Expected VerseInformation::Content not {:?}", c);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Recursively called to handle child elements.
|
|
|
+ ///
|
|
|
+ /// self.classes contains the parent's classes.
|
|
|
+ /// class_hash contains the current element's classes.
|
|
|
+ fn recursive_walker(&mut self, element: scraper::element_ref::ElementRef<'_>) {
|
|
|
+ let class_hash = Self::clean_class(element);
|
|
|
+ if self.classes.contains("ChapterContent_note") {
|
|
|
+ // We're in the note.
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_body") {
|
|
|
+ // Note body.
|
|
|
+ let mut has_children = false;
|
|
|
+ for child in element.child_elements() {
|
|
|
+ has_children = true;
|
|
|
+ if let Some(cl) = child.attr("class") {
|
|
|
+ if cl.contains("_label__") || cl.contains("_fr__") {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ let text = Self::element_text(child);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.add_note(&Self::element_text(child));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if !has_children {
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.add_note(&text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Since we've handled children elements here, we're done here.
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_verse") {
|
|
|
+ if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
+ if self.chapter_verse != ch_v {
|
|
|
+ self.chapter_verse = ch_v.to_string();
|
|
|
+ self.results
|
|
|
+ .push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_content") {
|
|
|
+ // Content.
|
|
|
+ let quoted = self.classes.contains("ChapterContent_q1")
|
|
|
+ || self.classes.contains("ChapterContent_q2");
|
|
|
+ let red = self.classes.contains("ChapterContent_wj")
|
|
|
+ || self.classes.contains("ChapterContent_wordsofchrist");
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ let paragraph = self.paragraph;
|
|
|
+ if paragraph {
|
|
|
+ self.paragraph = false;
|
|
|
+ }
|
|
|
+
|
|
|
+ self.add_content(
|
|
|
+ VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph,
|
|
|
+ red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_heading") {
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.results.push(VerseInformation::Heading(text));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_p") {
|
|
|
+ self.paragraph = true;
|
|
|
+ }
|
|
|
+
|
|
|
+        // NOTE(review): has_children appears to always return true here, so this guard may never actually skip recursion — confirm against scraper's ElementRef.
|
|
|
+
|
|
|
+ if element.has_children() {
|
|
|
+ // Add element classes to class tracker.
|
|
|
+ for element_class in class_hash.iter() {
|
|
|
+ self.classes.insert(element_class.clone());
|
|
|
+ }
|
|
|
+
|
|
|
+ for child in element.child_elements() {
|
|
|
+ self.recursive_walker(child);
|
|
|
+ }
|
|
|
+
|
|
|
+ for element_class in class_hash.iter() {
|
|
|
+ self.classes.remove(element_class);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Parse the element (and children) into VerseInformation.
|
|
|
+ pub fn parse(&mut self, element: scraper::element_ref::ElementRef<'_>) -> &[VerseInformation] {
|
|
|
+ self.clear();
|
|
|
+ self.recursive_walker(element);
|
|
|
+ self.results.as_slice()
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
/// Extract just the Chapter's verses.
|
|
|
///
|
|
@@ -374,6 +643,8 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
}
|
|
|
|
|
|
let mut chapter_number: u8 = 0;
|
|
|
+ let mut verse_number: u8 = 0;
|
|
|
+ let mut walker = WalkerParser::new();
|
|
|
|
|
|
// Locate the div that contains all of the chapter verses
|
|
|
let chapter_selector = scraper::Selector::parse(r#"div[class*="_chapter__"]"#).unwrap();
|
|
@@ -381,10 +652,67 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
// Ok, this is the chapter section.
|
|
|
|
|
|
// This works amazingly well for showing how the html is structured.
|
|
|
- // show_structure(chapter, 0);
|
|
|
+ show_structure(chapter, 0);
|
|
|
+ let results = walker.parse(chapter);
|
|
|
+
|
|
|
+ println!("Elements: {:?}", results);
|
|
|
|
|
|
- println!("Elements: {:?}", element_walker(chapter));
|
|
|
+ let mut heading = String::new();
|
|
|
|
|
|
+ for r in results {
|
|
|
+ match r {
|
|
|
+ VerseInformation::Heading(h) => {
|
|
|
+ heading = h.clone();
|
|
|
+ }
|
|
|
+ VerseInformation::ChapterVerse(cv) => {
|
|
|
+ let parts = cv.split(".").collect::<Vec<_>>();
|
|
|
+ chapter_number = parts[1].parse().unwrap();
|
|
|
+ verse_number = parts[2].parse().unwrap();
|
|
|
+
|
|
|
+ if !heading.is_empty() {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Heading(heading.clone()));
|
|
|
+ heading.clear();
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ if result.verses.len() < verse_number as usize {
|
|
|
+ bail!(
|
|
|
+ "Len = {}, wanting {}",
|
|
|
+ result.verses.len() + 1,
|
|
|
+ verse_number
|
|
|
+ );
|
|
|
+ }
|
|
|
+ */
|
|
|
+ }
|
|
|
+ VerseInformation::Note(n) => {
|
|
|
+ if verse_number == 0 {
|
|
|
+ println!("DERP! verse_number is zero! Note: {}", n);
|
|
|
+ } else {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Note(n.clone()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph,
|
|
|
+ red,
|
|
|
+ } => {
|
|
|
+ if verse_number == 0 {
|
|
|
+ println!("DERP! verse_number is zero! Content: {}!", text);
|
|
|
+ } else {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Verse {
|
|
|
+ text: text.to_string(),
|
|
|
+ paragraph: *paragraph,
|
|
|
+ quote: *quoted,
|
|
|
+ red: *red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ /*
|
|
|
println!("Chapter: {}", chapter.html());
|
|
|
|
|
|
// Look for _s1__ and _p__
|
|
@@ -623,6 +951,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ */
|
|
|
} else {
|
|
|
bail!("Unable to locate the div tag with _chapter__.");
|
|
|
}
|
|
@@ -630,114 +959,6 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
Ok((book, chapter_number, result))
|
|
|
}
|
|
|
|
|
|
-#[deprecated(note = "This is the old version, use extract_verses")]
|
|
|
-pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
- let mut result = Vec::<BasicVerse>::new();
|
|
|
-
|
|
|
- let buffer = std::fs::read_to_string(filename)?;
|
|
|
- let document = scraper::Html::parse_document(&buffer);
|
|
|
- let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
|
- let h1 = document.select(&h1_selector).next().unwrap();
|
|
|
- let mut book = element_text(h1);
|
|
|
- // println!("Heading: {}", element_text(h1));
|
|
|
- let mut book_trim = true;
|
|
|
-
|
|
|
- let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
|
|
|
- let _span_class_selector = scraper::Selector::parse("span[class]").unwrap();
|
|
|
- let _span_class_content_selector =
|
|
|
- scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
|
|
|
-
|
|
|
- for span in document.select(&span_data_usfm_selector) {
|
|
|
- // This will always be successful.
|
|
|
- if let Some(data) = span.attr("data-usfm") {
|
|
|
- // "GEN.1.2"
|
|
|
- // let ch_ver = data.split(".").skip(1).collect::<String>();
|
|
|
-
|
|
|
- let parts = data.split(".").skip(1).collect::<Vec<_>>();
|
|
|
- let mut chapter_number: u8 = 0;
|
|
|
- if parts.len() == 2 {
|
|
|
- chapter_number = parts[0].parse()?;
|
|
|
- }
|
|
|
-
|
|
|
- if book_trim {
|
|
|
- // Only trim the book once.
|
|
|
- book_trim = false;
|
|
|
- if chapter_number != 0 {
|
|
|
- // Remove chapter number from book.
|
|
|
- while book.pop() != Some(' ') {
|
|
|
- // Check for a problem.
|
|
|
- if book.is_empty() {
|
|
|
- bail!(format!(
|
|
|
- "Failed to trim the chapter from [{}].",
|
|
|
- element_text(h1)
|
|
|
- ));
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
-
|
|
|
- // GEN, 1, 2
|
|
|
- // But, there's some books that don't have chapters. Beware!
|
|
|
- let text_try = verse_element_text(span);
|
|
|
- // This looks good. ;)
|
|
|
- // println!("{} text: {:?}", data, text_try);
|
|
|
-
|
|
|
- if let Some(b) = result.get_mut(verse_number as usize - 1) {
|
|
|
- // Yes, it already exists...
|
|
|
- // It seems like these should be joined with "\n" instead of " ".
|
|
|
-
|
|
|
- if !(*b).text.ends_with("\n") && !text_try.starts_with("\n") {
|
|
|
- (*b).text.push_str("\n");
|
|
|
- }
|
|
|
- b.text.push_str(text_try.as_str());
|
|
|
- } else {
|
|
|
- let bv = BasicVerse {
|
|
|
- book: book.clone(),
|
|
|
- chapter: chapter_number,
|
|
|
- verse: verse_number,
|
|
|
- text: text_try,
|
|
|
- };
|
|
|
-
|
|
|
- result.push(bv);
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- // There can be multiples of these with matching values.
|
|
|
- let lines: String = span
|
|
|
- .select(&span_class_selector)
|
|
|
- .filter(|x| {
|
|
|
- if let Some(c) = x.attr("class") {
|
|
|
- if c.contains("content") {
|
|
|
- return true;
|
|
|
- }
|
|
|
- }
|
|
|
- false
|
|
|
- })
|
|
|
- .map(|x| {
|
|
|
- println!("x = {:?}", element_text(x));
|
|
|
- let init = String::new();
|
|
|
- let j = x.text().fold(init, |acc, x| {
|
|
|
- let mut s = acc;
|
|
|
- if x == " " {
|
|
|
- s.push_str("\n");
|
|
|
- } else {
|
|
|
- s.push_str(x);
|
|
|
- }
|
|
|
- s
|
|
|
- });
|
|
|
- j
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- println!("data {} lines {:?}", data, lines);
|
|
|
- */
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(result)
|
|
|
-}
|
|
|
-
|
|
|
pub fn find_next_chapter(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(html);
|
|
|
// let a_selector = scraper::Selector::parse("div>a").unwrap();
|