|
@@ -1,7 +1,9 @@
|
|
|
-use anyhow::{bail, Result};
|
|
|
+use super::config;
|
|
|
+use anyhow::{bail, Context, Result};
|
|
|
use regex::Regex;
|
|
|
use scraper;
|
|
|
use scraper::Element;
|
|
|
+use std::collections::HashSet;
|
|
|
use std::string::String;
|
|
|
|
|
|
pub fn find_versions(html: &String) -> Result<String> {
|
|
@@ -165,8 +167,473 @@ pub struct BasicVerse {
|
|
|
pub text: String,
|
|
|
}
|
|
|
|
|
|
+fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
+ let buffer =
|
|
|
+ std::fs::read_to_string(filename).context(format!("Failed to read: {}", filename))?;
|
|
|
+ Ok(scraper::Html::parse_document(&buffer))
|
|
|
+}
|
|
|
+
|
|
|
+// This shows child elements correctly.
|
|
|
+// If I could build a structure of the chapter, maybe I could parse it?
|
|
|
+// I would at least know what to look for...
|
|
|
+
|
|
|
+fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
+ // For output formatting.
|
|
|
+ let spacer = " ".repeat(depth as usize * 4);
|
|
|
+
|
|
|
+ // This can be multiple classes, so watch out here.
|
|
|
+ let cls = element.attr("class").unwrap();
|
|
|
+ println!(
|
|
|
+ "{} {} E {} {} {:?}",
|
|
|
+ depth,
|
|
|
+ spacer,
|
|
|
+ element.value().name(),
|
|
|
+ cls,
|
|
|
+ element.value()
|
|
|
+ );
|
|
|
+
|
|
|
+ if element.has_children() {
|
|
|
+ // This always seem to think there's children elements. ?!?
|
|
|
+ // println!(" >>");
|
|
|
+ for child in element.child_elements() {
|
|
|
+ show_structure(child, depth + 1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// One piece of information collected while walking a chapter's HTML.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VerseInformation {
    /// Verse reference, taken from an element's `data-usfm` attribute.
    ChapterVerse(String),
    /// Text of a verse-content element.
    Content(String),
    /// Text gathered from the body of a note.
    Note(String),
}
|
|
|
+
|
|
|
+/// Clean element class, and return in a set.
|
|
|
+///
|
|
|
+/// Classes that have __ in them are returned without the __ and ...
|
|
|
+fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String> {
|
|
|
+ let mut result = HashSet::<String>::new();
|
|
|
+ if let Some(e_class) = element.attr("class") {
|
|
|
+ for c in e_class.split(" ") {
|
|
|
+ if let Some(chapter) = c.split_once("__") {
|
|
|
+ result.insert(chapter.0.to_string());
|
|
|
+ } else {
|
|
|
+ result.insert(c.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result
|
|
|
+}
|
|
|
+
|
|
|
+// This doesn't work because ft is a child of body.
|
|
|
+fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
|
|
|
+ let body_selector = scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
+ let mut text = String::new();
|
|
|
+
|
|
|
+ if let Some(body) = element.select(&body_selector).next() {
|
|
|
+ // for body in element.select(&body_selector).next() {
|
|
|
+ if !text.is_empty() {
|
|
|
+ text.push_str(" ");
|
|
|
+ }
|
|
|
+ text.push_str(element_text(body).as_str());
|
|
|
+ }
|
|
|
+ if !text.is_empty() {
|
|
|
+ results.push(VerseInformation::Note(text));
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn add_append_note(results: &mut Vec<VerseInformation>, note:&str) {
|
|
|
+ if let Some(last) = results.last_mut() {
|
|
|
+ if let VerseInformation::Note(n) = last {
|
|
|
+ // Ok, the last thing is a "ChapterVerse".
|
|
|
+ n.push_str(" ");
|
|
|
+ n.push_str(note);
|
|
|
+ } else {
|
|
|
+ results.push(VerseInformation::Note(note.to_string()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn walker(
|
|
|
+ element: scraper::element_ref::ElementRef<'_>,
|
|
|
+ results: &mut Vec<VerseInformation>,
|
|
|
+ classes: &mut HashSet<String>,
|
|
|
+ depth: u32,
|
|
|
+) {
|
|
|
+ // For output formatting.
|
|
|
+ // let spacer = " ".repeat(depth as usize * 4);
|
|
|
+ let class_hash = clean_class(element);
|
|
|
+
|
|
|
+ if classes.contains("ChapterContent_note") {
|
|
|
+ // println!("note: {}", element.html());
|
|
|
+
|
|
|
+ // Ok, we're in the "note"
|
|
|
+ // Look for body or ft
|
|
|
+ if class_hash.contains("ChapterContent_body") {
|
|
|
+ // This the the body
|
|
|
+ let mut has_children = false;
|
|
|
+ for child in element.child_elements() {
|
|
|
+ has_children = true;
|
|
|
+ if let Some(cl) = child.attr("class") {
|
|
|
+ // label = "#"
|
|
|
+ // _fr__ = chapter_verse
|
|
|
+ if cl.contains("_label__") || cl.contains("_fr__") {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ add_append_note(results, &element_text(child));
|
|
|
+ // results.push(VerseInformation::Note(element_text(child)));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if !has_children {
|
|
|
+ let text = element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ // Check previous results?
|
|
|
+ add_append_note(results, &text);
|
|
|
+ // results.push(VerseInformation::Note(text));
|
|
|
+ }
|
|
|
+ // No children, we can return.
|
|
|
+
|
|
|
+ }
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // let mut ch_verse = String::new();
|
|
|
+ if class_hash.contains("ChapterContent_verse") {
|
|
|
+ if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
+ // Check the last item.
|
|
|
+ if let Some(last) = results.last() {
|
|
|
+ if let VerseInformation::ChapterVerse(_) = last {
|
|
|
+ // Ok, the last thing is a "ChapterVerse". Remove it.
|
|
|
+ results.pop();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // We'll have to clean out the results, sometimes we get chapter and verse
|
|
|
+ // information (with a blank content).
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_content") {
|
|
|
+ // Ok! We have content.
|
|
|
+ // Should I check the classes here for:
|
|
|
+ // _p__ Paragraph?
|
|
|
+ // _q1__, _q2__ Quote?
|
|
|
+
|
|
|
+ let text = element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ results.push(VerseInformation::Content(text));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if element.has_children() {
|
|
|
+ // Add the classes to our class tracker.
|
|
|
+ for ch in class_hash.iter() {
|
|
|
+ classes.insert(ch.clone());
|
|
|
+ }
|
|
|
+
|
|
|
+ for child in element.child_elements() {
|
|
|
+ walker(child, results, classes, depth + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Remove the classes from the class tracker.
|
|
|
+ for ch in class_hash {
|
|
|
+ classes.remove(&ch);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
|
|
|
+ let mut result = Vec::<VerseInformation>::new();
|
|
|
+ let mut classes = HashSet::<String>::new();
|
|
|
+ walker(element, &mut result, &mut classes, 0);
|
|
|
+ result
|
|
|
+}
|
|
|
+
|
|
|
+/// Extract just the Chapter's verses.
|
|
|
+///
|
|
|
+/// Returns Book, Chapter, and Verses
|
|
|
+pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVersesJSON)> {
|
|
|
+ let mut result = config::BasicVersesJSON::new();
|
|
|
+ let document = parse_html_file(filename)?;
|
|
|
+ let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
|
+ let h1 = document.select(&h1_selector).next().unwrap();
|
|
|
+ let mut book = element_text(h1);
|
|
|
+
|
|
|
+ // Remove chapter number from book.
|
|
|
+ while book.pop() != Some(' ') {
|
|
|
+ // Check for a problem.
|
|
|
+ if book.is_empty() {
|
|
|
+ bail!(format!(
|
|
|
+ "Failed to trim the chapter from [{}].",
|
|
|
+ element_text(h1)
|
|
|
+ ));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut chapter_number: u8 = 0;
|
|
|
+
|
|
|
+ // Locate the div that contains all of the chapter verses
|
|
|
+ let chapter_selector = scraper::Selector::parse(r#"div[class*="_chapter__"]"#).unwrap();
|
|
|
+ if let Some(chapter) = document.select(&chapter_selector).next() {
|
|
|
+ // Ok, this is the chapter section.
|
|
|
+
|
|
|
+ // This works amazingly well for showing how the html is structured.
|
|
|
+ // show_structure(chapter, 0);
|
|
|
+
|
|
|
+ println!("Elements: {:?}", element_walker(chapter));
|
|
|
+
|
|
|
+ println!("Chapter: {}", chapter.html());
|
|
|
+
|
|
|
+ // Look for _s1__ and _p__
|
|
|
+ // Missing _q1__ and _q2__ (quoted text?) It is left-indented.
|
|
|
+
|
|
|
+ let section_or_p_selector = scraper::Selector::parse(
|
|
|
+ r#"div[class*="_p__"], div[class*="_s1__"], div[class*="_q1__"], div[class*="_q2__"]"#,
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
+ let mut section_header = String::new();
|
|
|
+ let mut start_paragraph = false;
|
|
|
+ let section_heading_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class*="_heading__"]"#).unwrap();
|
|
|
+ let verse_selector = scraper::Selector::parse(r#"span[class*="_verse__"]"#).unwrap();
|
|
|
+
|
|
|
+ for section in chapter.select(§ion_or_p_selector) {
|
|
|
+ if let Some(cls) = section.attr("class") {
|
|
|
+ if cls.contains("_s1__") {
|
|
|
+ // Get section header
|
|
|
+ section_header.clear();
|
|
|
+ for sh in section.select(§ion_heading_selector) {
|
|
|
+ if !section_header.is_empty() {
|
|
|
+ section_header.push_str(" ");
|
|
|
+ }
|
|
|
+ section_header.push_str(&element_text(sh));
|
|
|
+ }
|
|
|
+ println!("Heading: {}", section_header);
|
|
|
+ /*
|
|
|
+ if let Some(section_heading) = section.select(§ion_heading_selector).next()
|
|
|
+ {
|
|
|
+ section_header = element_text(section_heading);
|
|
|
+ println!("Heading: {}", section_header);
|
|
|
+ }
|
|
|
+ */
|
|
|
+ } else if cls.contains("_p__") {
|
|
|
+ start_paragraph = true;
|
|
|
+ println!("¶ paragraph");
|
|
|
+
|
|
|
+ // Process verses here...
|
|
|
+
|
|
|
+ // We do get verses with blank content (from the previous paragraph).
|
|
|
+ // We need to handle that.
|
|
|
+
|
|
|
+ for verse in section.select(&verse_selector) {
|
|
|
+ if let Some(ch_ver) = verse.attr("data-usfm") {
|
|
|
+ println!(">> {}", ch_ver);
|
|
|
+
|
|
|
+ let mut output_chapter = false;
|
|
|
+
|
|
|
+ let parts = ch_ver.split(".").skip(1).collect::<Vec<_>>();
|
|
|
+ chapter_number = 0;
|
|
|
+ if parts.len() == 2 {
|
|
|
+ chapter_number = parts[0].parse()?;
|
|
|
+ }
|
|
|
+ let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
+
|
|
|
+ // println!("Book {} : {}", book, ch_ver);
|
|
|
+
|
|
|
+ // Find verses content
|
|
|
+ let verse_content_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class*="Content_content__"]"#)
|
|
|
+ .unwrap();
|
|
|
+ for content in verse.select(&verse_content_selector) {
|
|
|
+ // Check for empty content -- and skip it.
|
|
|
+ let content_text = element_text(content);
|
|
|
+ if content_text.is_empty() {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ let verse = result.new_verse(verse_number as usize);
|
|
|
+
|
|
|
+ if start_paragraph {
|
|
|
+ verse.start_paragraph = true;
|
|
|
+ start_paragraph = false;
|
|
|
+ }
|
|
|
+ if !section_header.is_empty() {
|
|
|
+ verse.heading = Some(section_header);
|
|
|
+ section_header = String::new();
|
|
|
+ }
|
|
|
+
|
|
|
+ if !verse.verse.is_empty() {
|
|
|
+ verse.verse.push_str(" ");
|
|
|
+ }
|
|
|
+ verse.verse.push_str(&content_text);
|
|
|
+
|
|
|
+ if !output_chapter {
|
|
|
+ output_chapter = true;
|
|
|
+ println!("Book {} : {}", book, ch_ver);
|
|
|
+ }
|
|
|
+
|
|
|
+ println!(">> {}", content.html());
|
|
|
+ println!(" [{}]", content_text);
|
|
|
+ }
|
|
|
+ println!("-- next verse --");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ println!("end ¶ paragraph");
|
|
|
+ } else {
|
|
|
+ // _q1__, _q2__
|
|
|
+ // Each part of the verse is wrapped in the _q1__ or _q2__'s.
|
|
|
+
|
|
|
+ /*
|
|
|
+ BUG:
|
|
|
+ Galacians 2.13-14.
|
|
|
+ {
|
|
|
+ "start_paragraph": false,
|
|
|
+ "heading": null,
|
|
|
+ "verse": ""
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "start_paragraph": false,
|
|
|
+ "heading": null,
|
|
|
+ "verse": ""
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "start_paragraph": true,
|
|
|
+ "heading": "Justified by Faith",
|
|
|
+ "verse": "But if, in our endeavor to be justified in Christ, we too were found to be sinners, is Christ then a servant of sin? Certainly not!"
|
|
|
+ },
|
|
|
+
|
|
|
+ 2.13 and 2.14 aren't wrapped in anything. (no _p__, _q1__, or _q2__)
|
|
|
+
|
|
|
+ Amos 5.1
|
|
|
+ Hear this word that I take up over you in lamentation, O house of Israel:
|
|
|
+ {
|
|
|
+ "start_paragraph": true,
|
|
|
+ "heading": "Seek the",
|
|
|
+ "verse": "Hear this word that I take up over you in lamentation, O house of Israel:"
|
|
|
+ },
|
|
|
+
|
|
|
+ // Heading in multiple spans.
|
|
|
+ <div data-usfm="AMO.5" class="ChapterContent_chapter__uvbXo">
|
|
|
+ <div class="ChapterContent_label__R2PLt">5</div>
|
|
|
+ <div class="ChapterContent_s1__bNNaW">
|
|
|
+ <span class="ChapterContent_heading__xBDcs">Seek the </span>
|
|
|
+ <span class="ChapterContent_nd__ECPAf">
|
|
|
+ <span class="ChapterContent_heading__xBDcs">Lord</span>
|
|
|
+ </span>
|
|
|
+ <span class="ChapterContent_heading__xBDcs"> and Live</span>
|
|
|
+ </div>
|
|
|
+ <div class="ChapterContent_p__dVKHb">
|
|
|
+ <span class="ChapterContent_content__RrUqA"> </span>
|
|
|
+
|
|
|
+ Amos 5.4
|
|
|
+ {
|
|
|
+ "start_paragraph": false,
|
|
|
+ "heading": null,
|
|
|
+ "verse": "“Seek me and live;"
|
|
|
+ },
|
|
|
+
|
|
|
+ Missing non-quoted part.
|
|
|
+
|
|
|
+ */
|
|
|
+
|
|
|
+ // So, sections of a verse could be "Quoted"...
|
|
|
+ // Also, sections could be marked red letters.
|
|
|
+
|
|
|
+ // Pulling a verse:
|
|
|
+ // jq .book.Galatians.chapters[c].verses[v].verse
|
|
|
+
|
|
|
+ // I get duplicate outputs of Book and next verse for the same verse.
|
|
|
+ /*
|
|
|
+ >> GEN.1.27
|
|
|
+ Book Genesis : GEN.1.27
|
|
|
+ >> <span class="ChapterContent_content__RrUqA">So God created man in his own image,</span>
|
|
|
+ [So God created man in his own image,]
|
|
|
+ -- next verse --
|
|
|
+ >> GEN.1.27
|
|
|
+ Book Genesis : GEN.1.27
|
|
|
+ >> <span class="ChapterContent_content__RrUqA">in the image of God he created him;</span>
|
|
|
+ [in the image of God he created him;]
|
|
|
+ -- next verse --
|
|
|
+ >> GEN.1.27
|
|
|
+ Book Genesis : GEN.1.27
|
|
|
+ >> <span class="ChapterContent_content__RrUqA">male and female he created them.</span>
|
|
|
+ [male and female he created them.]
|
|
|
+ -- next verse --
|
|
|
+ ¶ paragraph
|
|
|
+ >> GEN.1.27
|
|
|
+ -- next verse --
|
|
|
+ */
|
|
|
+ for verse in section.select(&verse_selector) {
|
|
|
+ if let Some(ch_ver) = verse.attr("data-usfm") {
|
|
|
+ println!(">> {}", ch_ver);
|
|
|
+
|
|
|
+ let mut output_chapter = false;
|
|
|
+
|
|
|
+ let parts = ch_ver.split(".").skip(1).collect::<Vec<_>>();
|
|
|
+ chapter_number = 0;
|
|
|
+ if parts.len() == 2 {
|
|
|
+ chapter_number = parts[0].parse()?;
|
|
|
+ }
|
|
|
+ let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
+
|
|
|
+ // println!("Book {} : {}", book, ch_ver);
|
|
|
+
|
|
|
+ // Find verses content
|
|
|
+ let verse_content_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class*="Content_content__"]"#)
|
|
|
+ .unwrap();
|
|
|
+ for content in verse.select(&verse_content_selector) {
|
|
|
+ // Check for empty content -- and skip it.
|
|
|
+ let content_text = element_text(content);
|
|
|
+ if content_text.is_empty() {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ let verse = result.new_verse(verse_number as usize);
|
|
|
+
|
|
|
+ if start_paragraph {
|
|
|
+ verse.start_paragraph = true;
|
|
|
+ start_paragraph = false;
|
|
|
+ }
|
|
|
+ if !section_header.is_empty() {
|
|
|
+ verse.heading = Some(section_header);
|
|
|
+ section_header = String::new();
|
|
|
+ }
|
|
|
+
|
|
|
+ if !verse.verse.is_empty() {
|
|
|
+ verse.verse.push_str(" ");
|
|
|
+ }
|
|
|
+ verse.verse.push_str(&content_text);
|
|
|
+
|
|
|
+ if !output_chapter {
|
|
|
+ output_chapter = true;
|
|
|
+ println!("Book {} : {}", book, ch_ver);
|
|
|
+ }
|
|
|
+
|
|
|
+ println!(">> {}", content.html());
|
|
|
+ println!(" [{}]", content_text);
|
|
|
+ }
|
|
|
+ println!("-- next verse --");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ bail!("Unable to locate the div tag with _chapter__.");
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok((book, chapter_number, result))
|
|
|
+}
|
|
|
+
|
|
|
+#[deprecated(note = "This is the old version, use extract_verses")]
|
|
|
pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
let mut result = Vec::<BasicVerse>::new();
|
|
|
+
|
|
|
let buffer = std::fs::read_to_string(filename)?;
|
|
|
let document = scraper::Html::parse_document(&buffer);
|
|
|
let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
@@ -200,7 +667,10 @@ pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
while book.pop() != Some(' ') {
|
|
|
// Check for a problem.
|
|
|
if book.is_empty() {
|
|
|
- bail!(format!("Failed to trim the chapter from [{}].", element_text(h1)));
|
|
|
+ bail!(format!(
|
|
|
+ "Failed to trim the chapter from [{}].",
|
|
|
+ element_text(h1)
|
|
|
+ ));
|
|
|
}
|
|
|
}
|
|
|
}
|