|
@@ -3,7 +3,7 @@ use anyhow::{bail, Context, Result};
|
|
|
use regex::Regex;
|
|
|
use scraper;
|
|
|
use scraper::Element;
|
|
|
-use std::collections::{HashMap, HashSet};
|
|
|
+use std::collections::HashSet;
|
|
|
use std::string::String;
|
|
|
|
|
|
pub fn find_versions(html: &String) -> Result<String> {
|
|
@@ -50,10 +50,12 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
text
|
|
|
}
|
|
|
|
|
|
+
|
|
|
/// Extract element verse text
|
|
|
///
|
|
|
/// This trims the elements (translating " " to "\n").
|
|
|
/// Joins with a single space.
|
|
|
+#[allow(dead_code)]
|
|
|
fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
let span_class = scraper::Selector::parse("span[class]").unwrap();
|
|
|
let text: String = element
|
|
@@ -158,6 +160,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
Ok(result)
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
#[derive(Debug)]
|
|
|
pub struct BasicVerse {
|
|
|
pub book: String,
|
|
@@ -166,6 +169,7 @@ pub struct BasicVerse {
|
|
|
// pub chapter_verse: String,
|
|
|
pub text: String,
|
|
|
}
|
|
|
+*/
|
|
|
|
|
|
fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
let buffer =
|
|
@@ -177,18 +181,22 @@ fn parse_html_file(filename: &str) -> Result<scraper::Html> {
|
|
|
// If I could build a structure of the chapter, maybe I could parse it?
|
|
|
// I would at least know what to look for...
|
|
|
|
|
|
-fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
+/// Display the structure of the HTML
|
|
|
+///
|
|
|
+/// This shows a properly indented layout of the HTML tags.
|
|
|
+/// It shows what is nested in what, and what attributes the element
|
|
|
+/// has. (And it doesn't delete empty tags like html tidy does.)
|
|
|
+pub fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
// For output formatting.
|
|
|
let spacer = " ".repeat(depth as usize * 4);
|
|
|
|
|
|
// This can be multiple classes, so watch out here.
|
|
|
- let cls = element.attr("class").unwrap();
|
|
|
+ // let cls = element.attr("class").unwrap();
|
|
|
println!(
|
|
|
- "{} {} E {} {} {:?}",
|
|
|
+ "{} {} E {} {:?}",
|
|
|
depth,
|
|
|
spacer,
|
|
|
element.value().name(),
|
|
|
- cls,
|
|
|
element.value()
|
|
|
);
|
|
|
|
|
@@ -201,9 +209,11 @@ fn show_structure(element: scraper::element_ref::ElementRef<'_>, depth: u32) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Verse information
|
|
|
#[derive(Debug)]
|
|
|
pub enum VerseInformation {
|
|
|
Heading(String),
|
|
|
+    /// Chapter and verse from the `data-usfm` attribute, e.g. "JHN.3.16" (book.chapter.verse)
|
|
|
ChapterVerse(String),
|
|
|
Content {
|
|
|
text: String,
|
|
@@ -211,9 +221,11 @@ pub enum VerseInformation {
|
|
|
paragraph: bool,
|
|
|
red: bool,
|
|
|
},
|
|
|
+ /// Verse note
|
|
|
Note(String),
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
/// Clean element class, and return in a set.
|
|
|
///
|
|
|
/// Classes that have __ in them are returned without the __ and ...
|
|
@@ -408,6 +420,206 @@ fn element_walker(element: scraper::element_ref::ElementRef<'_>) -> Vec<VerseInf
|
|
|
walker(element, &mut result, &mut classes, &mut track, 0);
|
|
|
result
|
|
|
}
|
|
|
+*/
|
|
|
+
|
|
|
+pub struct WalkerParser {
|
|
|
+ results: Vec<VerseInformation>,
|
|
|
+ classes: HashSet<String>,
|
|
|
+ paragraph: bool,
|
|
|
+ chapter_verse: String,
|
|
|
+}
|
|
|
+
|
|
|
+impl WalkerParser {
|
|
|
+ pub fn new() -> Self {
|
|
|
+ Self {
|
|
|
+ results: Vec::<VerseInformation>::new(),
|
|
|
+ classes: HashSet::<String>::new(),
|
|
|
+ paragraph: false,
|
|
|
+ chapter_verse: String::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Reset the parser's internal state.
|
|
|
+ pub fn clear(&mut self) {
|
|
|
+ self.results.clear();
|
|
|
+ self.classes.clear();
|
|
|
+ self.paragraph = false;
|
|
|
+ self.chapter_verse.clear();
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Extract element text, trimmed of whitespace.
|
|
|
+ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
+ let text = element
|
|
|
+ .text()
|
|
|
+ .map(|s| s.trim_matches(char::is_whitespace))
|
|
|
+ .filter(|x| !x.is_empty())
|
|
|
+ .collect::<String>();
|
|
|
+ text
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Clean element class, and return in a set.
|
|
|
+ ///
|
|
|
+ /// Classes that have __ in them are returned without the __ and ...
|
|
|
+ fn clean_class(element: scraper::element_ref::ElementRef<'_>) -> HashSet<String> {
|
|
|
+ let mut result = HashSet::<String>::new();
|
|
|
+ if let Some(e_class) = element.attr("class") {
|
|
|
+ for c in e_class.split(" ") {
|
|
|
+ if let Some(chapter) = c.split_once("__") {
|
|
|
+ result.insert(chapter.0.to_string());
|
|
|
+ } else {
|
|
|
+ result.insert(c.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Add note
|
|
|
+ ///
|
|
|
+ /// This will append to a previous note, if the last item in result
|
|
|
+ /// is a VerseInformation::Note.
|
|
|
+ fn add_note(&mut self, note: &str) {
|
|
|
+ if let Some(last) = self.results.last_mut() {
|
|
|
+ if let VerseInformation::Note(n) = last {
|
|
|
+ n.push_str(" ");
|
|
|
+ n.push_str(note);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ self.results.push(VerseInformation::Note(note.to_string()));
|
|
|
+ }
|
|
|
+
|
|
|
+ fn add_content(&mut self, c: VerseInformation) {
|
|
|
+ if let VerseInformation::Content{text: ref c_text, quoted: c_q, paragraph: c_p, red: c_r} = c {
|
|
|
+ // I have the Content in a more usable form.
|
|
|
+ let mut insert = false;
|
|
|
+ if let Some(last) = self.results.last_mut() {
|
|
|
+ if let VerseInformation::Content { text: l_text, quoted: l_q, paragraph: l_p, red: l_r } = last {
|
|
|
+ if *l_q != c_q || *l_r != c_r {
|
|
|
+ insert = true;
|
|
|
+ }
|
|
|
+ if c_p {
|
|
|
+ insert = true;
|
|
|
+ }
|
|
|
+ // Tests are done.
|
|
|
+ if !insert {
|
|
|
+ l_text.push_str(" ");
|
|
|
+ l_text.push_str(&c_text);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ self.results.push(c);
|
|
|
+ } else {
|
|
|
+ panic!("Expected VerseInformation::Content not {:?}", c);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Recursively called to handle child elements.
|
|
|
+ ///
|
|
|
+ /// self.classes contains the parent's classes.
|
|
|
+ /// class_hash contains the current element's classes.
|
|
|
+ fn recursive_walker(&mut self, element: scraper::element_ref::ElementRef<'_>) {
|
|
|
+ let class_hash = Self::clean_class(element);
|
|
|
+ if self.classes.contains("ChapterContent_note") {
|
|
|
+ // We're in the note.
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_body") {
|
|
|
+ // Note body.
|
|
|
+ let mut has_children = false;
|
|
|
+ for child in element.child_elements() {
|
|
|
+ has_children = true;
|
|
|
+ if let Some(cl) = child.attr("class") {
|
|
|
+ if cl.contains("_label__") || cl.contains("_fr__") {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ let text = Self::element_text(child);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.add_note(&Self::element_text(child));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if !has_children {
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.add_note(&text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Since we've handled children elements here, we're done here.
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_verse") {
|
|
|
+ if let Some(ch_v) = element.attr("data-usfm") {
|
|
|
+ if self.chapter_verse != ch_v {
|
|
|
+ self.chapter_verse = ch_v.to_string();
|
|
|
+ self.results
|
|
|
+ .push(VerseInformation::ChapterVerse(ch_v.to_string()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_content") {
|
|
|
+ // Content.
|
|
|
+ let quoted = self.classes.contains("ChapterContent_q1")
|
|
|
+ || self.classes.contains("ChapterContent_q2");
|
|
|
+ let red = self.classes.contains("ChapterContent_wj")
|
|
|
+ || self.classes.contains("ChapterContent_wordsofchrist");
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ let paragraph = self.paragraph;
|
|
|
+ if paragraph {
|
|
|
+ self.paragraph = false;
|
|
|
+ }
|
|
|
+
|
|
|
+ self.add_content(
|
|
|
+ VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph,
|
|
|
+ red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_heading") {
|
|
|
+ let text = Self::element_text(element);
|
|
|
+ if !text.is_empty() {
|
|
|
+ self.results.push(VerseInformation::Heading(text));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class_hash.contains("ChapterContent_p") {
|
|
|
+ self.paragraph = true;
|
|
|
+ }
|
|
|
+
|
|
|
+        // NOTE(review): has_children() appears true for any non-empty element (text nodes count as children) — confirm against ego-tree docs.
|
|
|
+
|
|
|
+ if element.has_children() {
|
|
|
+ // Add element classes to class tracker.
|
|
|
+ for element_class in class_hash.iter() {
|
|
|
+ self.classes.insert(element_class.clone());
|
|
|
+ }
|
|
|
+
|
|
|
+ for child in element.child_elements() {
|
|
|
+ self.recursive_walker(child);
|
|
|
+ }
|
|
|
+
|
|
|
+ for element_class in class_hash.iter() {
|
|
|
+ self.classes.remove(element_class);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Parse the element (and children) into VerseInformation.
|
|
|
+ pub fn parse(&mut self, element: scraper::element_ref::ElementRef<'_>) -> &[VerseInformation] {
|
|
|
+ self.clear();
|
|
|
+ self.recursive_walker(element);
|
|
|
+ self.results.as_slice()
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
/// Extract just the Chapter's verses.
|
|
|
///
|
|
@@ -431,6 +643,8 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
}
|
|
|
|
|
|
let mut chapter_number: u8 = 0;
|
|
|
+ let mut verse_number: u8 = 0;
|
|
|
+ let mut walker = WalkerParser::new();
|
|
|
|
|
|
// Locate the div that contains all of the chapter verses
|
|
|
let chapter_selector = scraper::Selector::parse(r#"div[class*="_chapter__"]"#).unwrap();
|
|
@@ -439,9 +653,66 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
|
|
|
// This works amazingly well for showing how the html is structured.
|
|
|
show_structure(chapter, 0);
|
|
|
+ let results = walker.parse(chapter);
|
|
|
+
|
|
|
+ println!("Elements: {:?}", results);
|
|
|
|
|
|
- println!("Elements: {:?}", element_walker(chapter));
|
|
|
+ let mut heading = String::new();
|
|
|
|
|
|
+ for r in results {
|
|
|
+ match r {
|
|
|
+ VerseInformation::Heading(h) => {
|
|
|
+ heading = h.clone();
|
|
|
+ }
|
|
|
+ VerseInformation::ChapterVerse(cv) => {
|
|
|
+ let parts = cv.split(".").collect::<Vec<_>>();
|
|
|
+ chapter_number = parts[1].parse().unwrap();
|
|
|
+ verse_number = parts[2].parse().unwrap();
|
|
|
+
|
|
|
+ if !heading.is_empty() {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Heading(heading.clone()));
|
|
|
+ heading.clear();
|
|
|
+ }
|
|
|
+ /*
|
|
|
+ if result.verses.len() < verse_number as usize {
|
|
|
+ bail!(
|
|
|
+ "Len = {}, wanting {}",
|
|
|
+ result.verses.len() + 1,
|
|
|
+ verse_number
|
|
|
+ );
|
|
|
+ }
|
|
|
+ */
|
|
|
+ }
|
|
|
+ VerseInformation::Note(n) => {
|
|
|
+ if verse_number == 0 {
|
|
|
+ println!("DERP! verse_number is zero! Note: {}", n);
|
|
|
+ } else {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Note(n.clone()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ VerseInformation::Content {
|
|
|
+ text,
|
|
|
+ quoted,
|
|
|
+ paragraph,
|
|
|
+ red,
|
|
|
+ } => {
|
|
|
+ if verse_number == 0 {
|
|
|
+ println!("DERP! verse_number is zero! Content: {}!", text);
|
|
|
+ } else {
|
|
|
+ let v = result.verse(verse_number as usize);
|
|
|
+ v.push(config::BasicVerseJSON::Verse {
|
|
|
+ text: text.to_string(),
|
|
|
+ paragraph: *paragraph,
|
|
|
+ quote: *quoted,
|
|
|
+ red: *red,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ /*
|
|
|
println!("Chapter: {}", chapter.html());
|
|
|
|
|
|
// Look for _s1__ and _p__
|
|
@@ -680,6 +951,7 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ */
|
|
|
} else {
|
|
|
bail!("Unable to locate the div tag with _chapter__.");
|
|
|
}
|
|
@@ -687,114 +959,6 @@ pub fn extract_verses(filename: &str) -> Result<(String, u8, config::BasicVerses
|
|
|
Ok((book, chapter_number, result))
|
|
|
}
|
|
|
|
|
|
-#[deprecated(note = "This is the old version, use extract_verses")]
|
|
|
-pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
- let mut result = Vec::<BasicVerse>::new();
|
|
|
-
|
|
|
- let buffer = std::fs::read_to_string(filename)?;
|
|
|
- let document = scraper::Html::parse_document(&buffer);
|
|
|
- let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
|
- let h1 = document.select(&h1_selector).next().unwrap();
|
|
|
- let mut book = element_text(h1);
|
|
|
- // println!("Heading: {}", element_text(h1));
|
|
|
- let mut book_trim = true;
|
|
|
-
|
|
|
- let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
|
|
|
- let _span_class_selector = scraper::Selector::parse("span[class]").unwrap();
|
|
|
- let _span_class_content_selector =
|
|
|
- scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
|
|
|
-
|
|
|
- for span in document.select(&span_data_usfm_selector) {
|
|
|
- // This will always be successful.
|
|
|
- if let Some(data) = span.attr("data-usfm") {
|
|
|
- // "GEN.1.2"
|
|
|
- // let ch_ver = data.split(".").skip(1).collect::<String>();
|
|
|
-
|
|
|
- let parts = data.split(".").skip(1).collect::<Vec<_>>();
|
|
|
- let mut chapter_number: u8 = 0;
|
|
|
- if parts.len() == 2 {
|
|
|
- chapter_number = parts[0].parse()?;
|
|
|
- }
|
|
|
-
|
|
|
- if book_trim {
|
|
|
- // Only trim the book once.
|
|
|
- book_trim = false;
|
|
|
- if chapter_number != 0 {
|
|
|
- // Remove chapter number from book.
|
|
|
- while book.pop() != Some(' ') {
|
|
|
- // Check for a problem.
|
|
|
- if book.is_empty() {
|
|
|
- bail!(format!(
|
|
|
- "Failed to trim the chapter from [{}].",
|
|
|
- element_text(h1)
|
|
|
- ));
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- let verse_number: u8 = parts.last().unwrap_or(&"0").parse()?;
|
|
|
-
|
|
|
- // GEN, 1, 2
|
|
|
- // But, there's some books that don't have chapters. Beware!
|
|
|
- let text_try = verse_element_text(span);
|
|
|
- // This looks good. ;)
|
|
|
- // println!("{} text: {:?}", data, text_try);
|
|
|
-
|
|
|
- if let Some(b) = result.get_mut(verse_number as usize - 1) {
|
|
|
- // Yes, it already exists...
|
|
|
- // It seems like these should be joined with "\n" instead of " ".
|
|
|
-
|
|
|
- if !(*b).text.ends_with("\n") && !text_try.starts_with("\n") {
|
|
|
- (*b).text.push_str("\n");
|
|
|
- }
|
|
|
- b.text.push_str(text_try.as_str());
|
|
|
- } else {
|
|
|
- let bv = BasicVerse {
|
|
|
- book: book.clone(),
|
|
|
- chapter: chapter_number,
|
|
|
- verse: verse_number,
|
|
|
- text: text_try,
|
|
|
- };
|
|
|
-
|
|
|
- result.push(bv);
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- // There can be multiples of these with matching values.
|
|
|
- let lines: String = span
|
|
|
- .select(&span_class_selector)
|
|
|
- .filter(|x| {
|
|
|
- if let Some(c) = x.attr("class") {
|
|
|
- if c.contains("content") {
|
|
|
- return true;
|
|
|
- }
|
|
|
- }
|
|
|
- false
|
|
|
- })
|
|
|
- .map(|x| {
|
|
|
- println!("x = {:?}", element_text(x));
|
|
|
- let init = String::new();
|
|
|
- let j = x.text().fold(init, |acc, x| {
|
|
|
- let mut s = acc;
|
|
|
- if x == " " {
|
|
|
- s.push_str("\n");
|
|
|
- } else {
|
|
|
- s.push_str(x);
|
|
|
- }
|
|
|
- s
|
|
|
- });
|
|
|
- j
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- println!("data {} lines {:?}", data, lines);
|
|
|
- */
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(result)
|
|
|
-}
|
|
|
-
|
|
|
pub fn find_next_chapter(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(html);
|
|
|
// let a_selector = scraper::Selector::parse("div>a").unwrap();
|