|
@@ -50,7 +50,6 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
text
|
|
|
}
|
|
|
|
|
|
-
|
|
|
/// Extract element verse text
|
|
|
///
|
|
|
/// This trims the elements, (translating " " to "\n").
|
|
@@ -160,17 +159,6 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
Ok(result)
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
-#[derive(Debug)]
|
|
|
-pub struct BasicVerse {
|
|
|
- pub book: String,
|
|
|
- pub chapter: u8,
|
|
|
- pub verse: u8,
|
|
|
- // pub chapter_verse: String,
|
|
|
- pub text: String,
|
|
|
-}
|
|
|
-*/
|
|
|
-
|
|
|
fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
let buffer =
|
|
|
std::fs::read_to_string(filename).context(format!("Failed to read: {}", filename))?;
|
|
@@ -225,203 +213,6 @@ pub enum VerseInformation {
|
|
|
Note(String),
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
-/// Clean element class, and return in a set.
|
|
|
-///
|
|
|
-/// Classes that have __ in them are returned without the __ and ...
|
|
|
-fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String> {
|
|
|
- let mut result = HashSet::<String>::new();
|
|
|
- if let Some(e_class) = element.attr("class") {
|
|
|
- for c in e_class.split(" ") {
|
|
|
- if let Some(chapter) = c.split_once("__") {
|
|
|
- result.insert(chapter.0.to_string());
|
|
|
- } else {
|
|
|
- result.insert(c.to_string());
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- result
|
|
|
-}
|
|
|
-
|
|
|
-// This doesn't work because ft is a child of body.
|
|
|
-fn walk_note(element: scraper::element_ref::ElementRef<'_>, results: &mut Vec<VerseInformation>) {
|
|
|
- let body_selector =
|
|
|
- scraper::Selector::parse(r#"span[class*="_body__"], span[class="ft"]"#).unwrap();
|
|
|
- let mut text = String::new();
|
|
|
-
|
|
|
- if let Some(body) = element.select(&body_selector).next() {
|
|
|
- // for body in element.select(&body_selector).next() {
|
|
|
- if !text.is_empty() {
|
|
|
- text.push_str(" ");
|
|
|
- }
|
|
|
- text.push_str(element_text(body).as_str());
|
|
|
- }
|
|
|
- if !text.is_empty() {
|
|
|
- results.push(VerseInformation::Note(text));
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-fn add_append_note(results: &mut Vec<VerseInformation>, note: &str) {
|
|
|
- if let Some(last) = results.last_mut() {
|
|
|
- if let VerseInformation::Note(n) = last {
|
|
|
- // Ok, the last thing is a "ChapterVerse".
|
|
|
- n.push_str(" ");
|
|
|
- n.push_str(note);
|
|
|
- } else {
|
|
|
- results.push(VerseInformation::Note(note.to_string()));
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-fn walker(
|
|
|
- element: scraper::element_ref::ElementRef<'_>,
|
|
|
- results: &mut Vec<VerseInformation>,
|
|
|
- classes: &mut HashSet<String>,
|
|
|
- track: &mut HashMap<String, String>,
|
|
|
- depth: u32,
|
|
|
-) {
|
|
|
- // For output formatting.
|
|
|
- // let spacer = " ".repeat(depth as usize * 4);
|
|
|
- let class_hash = clean_class(element);
|
|
|
-
|
|
|
- if classes.contains("ChapterContent_note") {
|
|
|
- // println!("note: {}", element.html());
|
|
|
-
|
|
|
- // Ok, we're in the "note"
|
|
|
- // Look for body or ft
|
|
|
- if class_hash.contains("ChapterContent_body") {
|
|
|
- // This the the body
|
|
|
- let mut has_children = false;
|
|
|
- for child in element.child_elements() {
|
|
|
- has_children = true;
|
|
|
- if let Some(cl) = child.attr("class") {
|
|
|
- // label = "#"
|
|
|
- // _fr__ = chapter_verse
|
|
|
- if cl.contains("_label__") || cl.contains("_fr__") {
|
|
|
- continue;
|
|
|
- }
|
|
|
- add_append_note(results, &element_text(child));
|
|
|
- // results.push(VerseInformation::Note(element_text(child)));
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if !has_children {
|
|
|
- let text = element_text(element);
|
|
|
- if !text.is_empty() {
|
|
|
- // Check previous results?
|
|
|
- add_append_note(results, &text);
|
|
|
- // results.push(VerseInformation::Note(text));
|
|
|
- }
|
|
|
- // No children, we can return.
|
|
|
- }
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // let mut ch_verse = String::new();
|
|
|
- if class_hash.contains("ChapterContent_verse") {
|
|
|
- if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
- // I'm getting duplicate ChapterVerse items in the results now.
|
|
|
- // Check the last item.
|
|
|
- let mut new_chv = false;
|
|
|
-
|
|
|
- if track.contains_key("ch_v") {
|
|
|
- if let Some(tchv) = track.get("ch_v") {
|
|
|
- if tchv != ch_v {
|
|
|
- new_chv = true;
|
|
|
- track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
- }
|
|
|
- }
|
|
|
- } else {
|
|
|
- new_chv = true;
|
|
|
- track.insert("ch_v".to_string(), ch_v.to_string());
|
|
|
- }
|
|
|
-
|
|
|
- if let Some(last) = results.last() {
|
|
|
- if let VerseInformation::ChapterVerse(_) = last {
|
|
|
- // Ok, the last thing is a "ChapterVerse". Remove it.
|
|
|
- results.pop();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if new_chv {
|
|
|
- results.push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // We'll have to clean out the results, sometimes we get chapter and verse
|
|
|
- // information (with a blank content).
|
|
|
-
|
|
|
- if class_hash.contains("ChapterContent_content") {
|
|
|
- // Ok! We have content.
|
|
|
- // Should I check the classes here for:
|
|
|
- // _p__ Paragraph?
|
|
|
- // _q1__, _q2__ Quote?
|
|
|
- let quoted = classes.contains("ChapterContent_q1") || classes.contains("ChapterContent_q2");
|
|
|
- let red = classes.contains("ChapterContent_wj")
|
|
|
- || classes.contains("ChapterContent_wordsofchrist");
|
|
|
- let text = element_text(element);
|
|
|
- if !text.is_empty() {
|
|
|
-
|
|
|
- // We have something to save. Is this start of paragraph?
|
|
|
- let p = track.contains_key("p");
|
|
|
- if p {
|
|
|
- // Ok, we're storing it. Reset the paragraph flag.
|
|
|
- track.remove("p");
|
|
|
- }
|
|
|
-
|
|
|
- results.push(VerseInformation::Content {
|
|
|
- text,
|
|
|
- quoted,
|
|
|
- paragraph: p,
|
|
|
- red,
|
|
|
- });
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if class_hash.contains("ChapterContent_heading") {
|
|
|
- let text = element_text(element);
|
|
|
- if !text.is_empty() {
|
|
|
- results.push(VerseInformation::Heading(text));
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if class_hash.contains("ChapterContent_p") {
|
|
|
- track.insert("p".to_string(), "".to_string());
|
|
|
- }
|
|
|
-
|
|
|
- // Unfortunately, has_children always returns true...
|
|
|
-
|
|
|
- if element.has_children() {
|
|
|
- // Add the classes to our class tracker.
|
|
|
- for ch in class_hash.iter() {
|
|
|
- classes.insert(ch.clone());
|
|
|
- }
|
|
|
-
|
|
|
- for child in element.child_elements() {
|
|
|
- walker(child, results, classes, track, depth + 1);
|
|
|
- }
|
|
|
-
|
|
|
- // Remove the classes from the class tracker.
|
|
|
- for ch in class_hash {
|
|
|
- classes.remove(&ch);
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-// TO FIX: Write this as a structure with impl method calls.
|
|
|
-// Eliminate the passing of state via the function calls.
|
|
|
-
|
|
|
-fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInformation> {
|
|
|
- let mut result = Vec::<VerseInformation>::new();
|
|
|
- let mut classes = HashSet::<String>::new();
|
|
|
- let mut track = HashMap::<String, String>::new();
|
|
|
- walker(element, &mut result, &mut classes, &mut track, 0);
|
|
|
- result
|
|
|
-}
|
|
|
-*/
|
|
|
-
|
|
|
pub struct WalkerParser {
|
|
|
results: Vec<VerseInformation>,
|
|
|
classes: HashSet<String>,
|
|
@@ -490,14 +281,26 @@ impl WalkerParser {
|
|
|
}
|
|
|
|
|
|
fn add_content(&mut self, c: VerseInformation) {
|
|
|
- if let VerseInformation::Content{text: ref c_text, quoted: c_q, paragraph: c_p, red: c_r} = c {
|
|
|
+ if let VerseInformation::Content {
|
|
|
+ text: ref c_text,
|
|
|
+ quoted: c_q,
|
|
|
+ paragraph: c_p,
|
|
|
+ red: c_r,
|
|
|
+ } = c
|
|
|
+ {
|
|
|
// I have the Content in a more usable form.
|
|
|
let mut insert = false;
|
|
|
if let Some(last) = self.results.last_mut() {
|
|
|
- if let VerseInformation::Content { text: l_text, quoted: l_q, paragraph: l_p, red: l_r } = last {
|
|
|
+ if let VerseInformation::Content {
|
|
|
+ text: l_text,
|
|
|
+ quoted: l_q,
|
|
|
+ paragraph: l_p,
|
|
|
+ red: l_r,
|
|
|
+ } = last
|
|
|
+ {
|
|
|
if *l_q != c_q || *l_r != c_r {
|
|
|
insert = true;
|
|
|
- }
|
|
|
+ }
|
|
|
if c_p {
|
|
|
insert = true;
|
|
|
}
|
|
@@ -574,8 +377,7 @@ impl WalkerParser {
|
|
|
self.paragraph = false;
|
|
|
}
|
|
|
|
|
|
- self.add_content(
|
|
|
- VerseInformation::Content {
|
|
|
+ self.add_content(VerseInformation::Content {
|
|
|
text,
|
|
|
quoted,
|
|
|
paragraph,
|
|
@@ -688,8 +490,8 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
if verse_number == 0 {
|
|
|
println!("DERP! verse_number is zero! Note: {}", n);
|
|
|
} else {
|
|
|
- let v = result.verse(verse_number as usize);
|
|
|
- v.push(config::BasicVerseJSON::Note(n.clone()));
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Note(n.clone()));
|
|
|
}
|
|
|
}
|
|
|
VerseInformation::Content {
|
|
@@ -712,246 +514,6 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- /*
|
|
|
- println!("Chapter: {}", chapter.html());
|
|
|
-
|
|
|
- // Look for _s1__ and _p__
|
|
|
- // Missing _q1__ and _q2__ (quoted text?) It is left-indented.
|
|
|
-
|
|
|
- let section_or_p_selector = scraper::Selector::parse(
|
|
|
- r#"div[class*="_p__"], div[class*="_s1__"], div[class*="_q1__"], div[class*="_q2__"]"#,
|
|
|
- )
|
|
|
- .unwrap();
|
|
|
- let mut section_header = String::new();
|
|
|
- let mut start_paragraph = false;
|
|
|
- let section_heading_selector =
|
|
|
- scraper::Selector::parse(r#"span[class*="_heading__"]"#).unwrap();
|
|
|
- let verse_selector = scraper::Selector::parse(r#"span[class*="_verse__"]"#).unwrap();
|
|
|
-
|
|
|
- for section in chapter.select(&section_or_p_selector) {
|
|
|
- if let Some(cls) = section.attr("class") {
|
|
|
- if cls.contains("_s1__") {
|
|
|
- // Get section header
|
|
|
- section_header.clear();
|
|
|
- for sh in section.select(&section_heading_selector) {
|
|
|
- if !section_header.is_empty() {
|
|
|
- section_header.push_str(" ");
|
|
|
- }
|
|
|
- section_header.push_str(&element_text(sh));
|
|
|
- }
|
|
|
- println!("Heading: {}", section_header);
|
|
|
- /*
|
|
|
- if let Some(section_heading) = section.select(&section_heading_selector).next()
|
|
|
- {
|
|
|
- section_header = element_text(section_heading);
|
|
|
- println!("Heading: {}", section_header);
|
|
|
- }
|
|
|
- */
|
|
|
- } else if cls.contains("_p__") {
|
|
|
- start_paragraph = true;
|
|
|
- println!("¶ paragraph");
|
|
|
-
|
|
|
- // Process verses here...
|
|
|
-
|
|
|
- // We do get verses with blank content (from the previous paragraph).
|
|
|
- // We need to handle that.
|
|
|
-
|
|
|
- for verse in section.select(&verse_selector) {
|
|
|
- if let Some(ch_ver) = verse.attr("data-usfm") {
|
|
|
- println!(">> {}", ch_ver);
|
|
|
-
|
|
|
- let mut output_chapter = false;
|
|
|
-
|
|
|
- let parts = ch_ver.split(".").skip(1).collect::<Vec<_>>();
|
|
|
- chapter_number = 0;
|
|
|
- if parts.len() == 2 {
|
|
|
- chapter_number = parts[0].parse()?;
|
|
|
- }
|
|
|
- let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
-
|
|
|
- // println!("Book {} : {}", book, ch_ver);
|
|
|
-
|
|
|
- // Find verses content
|
|
|
- let verse_content_selector =
|
|
|
- scraper::Selector::parse(r#"span[class*="Content_content__"]"#)
|
|
|
- .unwrap();
|
|
|
- for content in verse.select(&verse_content_selector) {
|
|
|
- // Check for empty content -- and skip it.
|
|
|
- let content_text = element_text(content);
|
|
|
- if content_text.is_empty() {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- let verse = result.new_verse(verse_number as usize);
|
|
|
-
|
|
|
- if start_paragraph {
|
|
|
- verse.start_paragraph = true;
|
|
|
- start_paragraph = false;
|
|
|
- }
|
|
|
- if !section_header.is_empty() {
|
|
|
- verse.heading = Some(section_header);
|
|
|
- section_header = String::new();
|
|
|
- }
|
|
|
-
|
|
|
- if !verse.verse.is_empty() {
|
|
|
- verse.verse.push_str(" ");
|
|
|
- }
|
|
|
- verse.verse.push_str(&content_text);
|
|
|
-
|
|
|
- if !output_chapter {
|
|
|
- output_chapter = true;
|
|
|
- println!("Book {} : {}", book, ch_ver);
|
|
|
- }
|
|
|
-
|
|
|
- println!(">> {}", content.html());
|
|
|
- println!(" [{}]", content_text);
|
|
|
- }
|
|
|
- println!("-- next verse --");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- println!("end ¶ paragraph");
|
|
|
- } else {
|
|
|
- // _q1__, _q2__
|
|
|
- // Each part of the verse is wrapped in the _q1__ or _q2__'s.
|
|
|
-
|
|
|
- /*
|
|
|
- BUG:
|
|
|
- Galacians 2.13-14.
|
|
|
- {
|
|
|
- "start_paragraph": false,
|
|
|
- "heading": null,
|
|
|
- "verse": ""
|
|
|
- },
|
|
|
- {
|
|
|
- "start_paragraph": false,
|
|
|
- "heading": null,
|
|
|
- "verse": ""
|
|
|
- },
|
|
|
- {
|
|
|
- "start_paragraph": true,
|
|
|
- "heading": "Justified by Faith",
|
|
|
- "verse": "But if, in our endeavor to be justified in Christ, we too were found to be sinners, is Christ then a servant of sin? Certainly not!"
|
|
|
- },
|
|
|
-
|
|
|
- 2.13 and 2.14 aren't wrapped in anything. (no _p__, _q1__, or _q2__)
|
|
|
-
|
|
|
- Amos 5.1
|
|
|
- Hear this word that I take up over you in lamentation, O house of Israel:
|
|
|
- {
|
|
|
- "start_paragraph": true,
|
|
|
- "heading": "Seek the",
|
|
|
- "verse": "Hear this word that I take up over you in lamentation, O house of Israel:"
|
|
|
- },
|
|
|
-
|
|
|
- // Heading in multiple spans.
|
|
|
- <div data-usfm="AMO.5" class="ChapterContent_chapter__uvbXo">
|
|
|
- <div class="ChapterContent_label__R2PLt">5</div>
|
|
|
- <div class="ChapterContent_s1__bNNaW">
|
|
|
- <span class="ChapterContent_heading__xBDcs">Seek the </span>
|
|
|
- <span class="ChapterContent_nd__ECPAf">
|
|
|
- <span class="ChapterContent_heading__xBDcs">Lord</span>
|
|
|
- </span>
|
|
|
- <span class="ChapterContent_heading__xBDcs"> and Live</span>
|
|
|
- </div>
|
|
|
- <div class="ChapterContent_p__dVKHb">
|
|
|
- <span class="ChapterContent_content__RrUqA"> </span>
|
|
|
-
|
|
|
- Amos 5.4
|
|
|
- {
|
|
|
- "start_paragraph": false,
|
|
|
- "heading": null,
|
|
|
- "verse": "“Seek me and live;"
|
|
|
- },
|
|
|
-
|
|
|
- Missing non-quoted part.
|
|
|
-
|
|
|
- */
|
|
|
-
|
|
|
- // So, sections of a verse could be "Quoted"...
|
|
|
- // Also, sections could be marked red letters.
|
|
|
-
|
|
|
- // Pulling a verse:
|
|
|
- // jq .book.Galatians.chapters[c].verses[v].verse
|
|
|
-
|
|
|
- // I get duplicate outputs of Book and next verse for the same verse.
|
|
|
- /*
|
|
|
- >> GEN.1.27
|
|
|
- Book Genesis : GEN.1.27
|
|
|
- >> <span class="ChapterContent_content__RrUqA">So God created man in his own image,</span>
|
|
|
- [So God created man in his own image,]
|
|
|
- -- next verse --
|
|
|
- >> GEN.1.27
|
|
|
- Book Genesis : GEN.1.27
|
|
|
- >> <span class="ChapterContent_content__RrUqA">in the image of God he created him;</span>
|
|
|
- [in the image of God he created him;]
|
|
|
- -- next verse --
|
|
|
- >> GEN.1.27
|
|
|
- Book Genesis : GEN.1.27
|
|
|
- >> <span class="ChapterContent_content__RrUqA">male and female he created them.</span>
|
|
|
- [male and female he created them.]
|
|
|
- -- next verse --
|
|
|
- ¶ paragraph
|
|
|
- >> GEN.1.27
|
|
|
- -- next verse --
|
|
|
- */
|
|
|
- for verse in section.select(&verse_selector) {
|
|
|
- if let Some(ch_ver) = verse.attr("data-usfm") {
|
|
|
- println!(">> {}", ch_ver);
|
|
|
-
|
|
|
- let mut output_chapter = false;
|
|
|
-
|
|
|
- let parts = ch_ver.split(".").skip(1).collect::<Vec<_>>();
|
|
|
- chapter_number = 0;
|
|
|
- if parts.len() == 2 {
|
|
|
- chapter_number = parts[0].parse()?;
|
|
|
- }
|
|
|
- let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
-
|
|
|
- // println!("Book {} : {}", book, ch_ver);
|
|
|
-
|
|
|
- // Find verses content
|
|
|
- let verse_content_selector =
|
|
|
- scraper::Selector::parse(r#"span[class*="Content_content__"]"#)
|
|
|
- .unwrap();
|
|
|
- for content in verse.select(&verse_content_selector) {
|
|
|
- // Check for empty content -- and skip it.
|
|
|
- let content_text = element_text(content);
|
|
|
- if content_text.is_empty() {
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- let verse = result.new_verse(verse_number as usize);
|
|
|
-
|
|
|
- if start_paragraph {
|
|
|
- verse.start_paragraph = true;
|
|
|
- start_paragraph = false;
|
|
|
- }
|
|
|
- if !section_header.is_empty() {
|
|
|
- verse.heading = Some(section_header);
|
|
|
- section_header = String::new();
|
|
|
- }
|
|
|
-
|
|
|
- if !verse.verse.is_empty() {
|
|
|
- verse.verse.push_str(" ");
|
|
|
- }
|
|
|
- verse.verse.push_str(&content_text);
|
|
|
-
|
|
|
- if !output_chapter {
|
|
|
- output_chapter = true;
|
|
|
- println!("Book {} : {}", book, ch_ver);
|
|
|
- }
|
|
|
-
|
|
|
- println!(">> {}", content.html());
|
|
|
- println!(" [{}]", content_text);
|
|
|
- }
|
|
|
- println!("-- next verse --");
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- */
|
|
|
} else {
|
|
|
bail!("Unable to locate the div tag with _chapter__.");
|
|
|
}
|