|
@@ -1,7 +1,8 @@
|
|
|
use anyhow::{bail, Result};
|
|
|
+use regex::Regex;
|
|
|
use scraper;
|
|
|
use scraper::Element;
|
|
|
-use regex::Regex;
|
|
|
+use std::string::String;
|
|
|
|
|
|
pub fn find_versions(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(&html);
|
|
@@ -11,7 +12,10 @@ pub fn find_versions(html: &String) -> Result<String> {
|
|
|
let text = element_text(a);
|
|
|
|
|
|
if version_match.is_match(&text) {
|
|
|
- return Ok(format!("Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}", text, text));
|
|
|
+ return Ok(format!(
|
|
|
+ "Mozilla/5.0 (X11; Linux x86_64; rv:{}) Gecko/20100101 Firefox/{}",
|
|
|
+ text, text
|
|
|
+ ));
|
|
|
}
|
|
|
}
|
|
|
bail!("Could not locate a version string.");
|
|
@@ -34,6 +38,7 @@ pub struct VerseOfDay {
|
|
|
pub reference: String,
|
|
|
}
|
|
|
|
|
|
+/// Extract element text, trimmed of whitespace.
|
|
|
fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
let text = element
|
|
|
.text()
|
|
@@ -43,6 +48,33 @@ fn element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
text
|
|
|
}
|
|
|
|
|
|
+/// Extract the verse text from an element.
|
|
|
+///
|
|
|
+/// This trims each element's text (translating " " to "\n").
|
|
|
+/// Joins with a single space.
|
|
|
+fn verse_element_text(element: scraper::ElementRef<'_>) -> String {
|
|
|
+ let span_class = scraper::Selector::parse("span[class]").unwrap();
|
|
|
+ let text: String = element
|
|
|
+ .select(&span_class)
|
|
|
+ .filter(|e| {
|
|
|
+ if let Some(c) = e.attr("class") {
|
|
|
+ return c.contains("content");
|
|
|
+ }
|
|
|
+ false
|
|
|
+ })
|
|
|
+ .map(|e| {
|
|
|
+ let text: String = e.text().collect::<String>();
|
|
|
+ if text == " " {
|
|
|
+ return String::from("\n");
|
|
|
+ } else {
|
|
|
+ return text.trim().to_string();
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect::<Vec<String>>()
|
|
|
+ .join(" ");
|
|
|
+ text
|
|
|
+}
|
|
|
+
|
|
|
pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
let document = scraper::Html::parse_document(&html);
|
|
|
// let a_selector = scraper::Selector::parse(r#"div>a[href^="/bible/"]"#).unwrap();
|
|
@@ -96,7 +128,7 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // Previous ones are in div[class="mlb-2"]
|
|
|
+ // Previous ones are in div[class="mlb-2"]
|
|
|
|
|
|
let prev_div_selector = scraper::Selector::parse(r#"div[class="mlb-2"]"#).unwrap();
|
|
|
let a_selector1 =
|
|
@@ -124,6 +156,68 @@ pub fn find_vod(html: &String) -> Result<Vec<VerseOfDay>> {
|
|
|
Ok(result)
|
|
|
}
|
|
|
|
|
|
+pub struct BasicVerse {
|
|
|
+ pub book: String,
|
|
|
+ pub chapter_verse: String,
|
|
|
+ pub verse: String,
|
|
|
+}
|
|
|
+
|
|
|
+pub fn extract_basic_verses(filename: &str) -> Result<Vec<BasicVerse>> {
|
|
|
+ let result = Vec::<BasicVerse>::new();
|
|
|
+ let buffer = std::fs::read_to_string(filename)?;
|
|
|
+ let document = scraper::Html::parse_document(&buffer);
|
|
|
+ let h1_selector = scraper::Selector::parse("h1").unwrap();
|
|
|
+ let h1 = document.select(&h1_selector).next().unwrap();
|
|
|
+ println!("Heading: {}", element_text(h1));
|
|
|
+
|
|
|
+ let span_data_usfm_selector = scraper::Selector::parse("span[data-usfm]").unwrap();
|
|
|
+ let span_class_selector = scraper::Selector::parse("span[class]").unwrap();
|
|
|
+ let _span_class_content_selector =
|
|
|
+ scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#).unwrap();
|
|
|
+
|
|
|
+ for span in document.select(&span_data_usfm_selector) {
|
|
|
+        // The span[data-usfm] selector guarantees the attribute is present, so this always matches.
|
|
|
+ if let Some(data) = span.attr("data-usfm") {
|
|
|
+ // "GEN.1.2"
|
|
|
+ let _parts = data.split(".").collect::<Vec<_>>();
|
|
|
+ // GEN, 1, 2
|
|
|
+            // But there are some books that don't have chapters. Beware!
|
|
|
+ let text_try = verse_element_text(span);
|
|
|
+ println!("trying: {:?}", text_try);
|
|
|
+
|
|
|
+ // There can be multiples of these with matching values.
|
|
|
+ let lines: String = span
|
|
|
+ .select(&span_class_selector)
|
|
|
+ .filter(|x| {
|
|
|
+ if let Some(c) = x.attr("class") {
|
|
|
+ if c.contains("content") {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ false
|
|
|
+ })
|
|
|
+ .map(|x| {
|
|
|
+ println!("x = {:?}", element_text(x));
|
|
|
+ let init = String::new();
|
|
|
+ let j = x.text().fold(init, |acc, x| {
|
|
|
+ let mut s = acc;
|
|
|
+ if x == " " {
|
|
|
+ s.push_str("\n");
|
|
|
+ } else {
|
|
|
+ s.push_str(x);
|
|
|
+ }
|
|
|
+ s
|
|
|
+ });
|
|
|
+ j
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ println!("data {} lines {:?}", data, lines);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(result)
|
|
|
+}
|
|
|
+
|
|
|
pub fn find_next_chapter(html: &String) -> Result<String> {
|
|
|
let document = scraper::Html::parse_document(html);
|
|
|
// let a_selector = scraper::Selector::parse("div>a").unwrap();
|