@@ -1,16 +1,18 @@
+use anyhow::Result;
 use clap::{Parser, Subcommand};
 use reqwest;
 use scraper;
 use std::{
     collections::HashMap,
-    fs::File,
-    io::Write,
     path::{Path, PathBuf},
     string::String,
     sync::LazyLock,
 };
 use std::{thread, time::Duration};

+mod fetch;
+mod parse;
+
 // Setup the command line options

 #[derive(Parser)]
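Neither new module appears in this diff. A minimal sketch of what src/fetch.rs presumably holds, assuming it is simply the fetch_cache function removed from main.rs below with its unwrap() calls swapped for anyhow-style propagation; the signature and the FetchResult fields are taken from the call sites in this diff, the body is otherwise an assumption:

use anyhow::{Context, Result};
use std::io::Write;
use std::path::Path;

pub struct FetchResult {
    pub cached: bool,
    pub html: String,
}

/// Fetch `url`, reusing a copy cached in `work_dir` when one exists.
pub fn fetch_cache(
    work_dir: &str,
    client: &reqwest::blocking::Client,
    url: &str,
) -> Result<FetchResult> {
    // Cache files are keyed by the last path segment of the URL (assumption,
    // mirroring the removed main.rs version).
    let (_, filename) = url.rsplit_once('/').context("URL has no path segment")?;
    let path = Path::new(work_dir).join(filename);

    if path.exists() {
        // File already exists -- use the cached copy.
        let html = std::fs::read_to_string(path)?;
        return Ok(FetchResult { cached: true, html });
    }

    println!("fetch_cache {} => {}", url, filename);
    let html = client.get(url).send()?.text()?;

    let mut file = std::fs::File::create(path)?;
    file.write_all(html.as_bytes())?;
    Ok(FetchResult { cached: false, html })
}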
@@ -60,6 +62,7 @@ static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
         ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
         ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
         ("NIV", "https://www.bible.com/bible/111/GEN.INTRO1.NIV"),
+        ("NKJV", "https://www.bible.com/bible/114/GEN.1.NKJV"),
         // https://www.bible.com/bible/111/GEN.1.NIV"),
         ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
     ])
@@ -77,10 +80,9 @@ static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
 });

 static BOOK_MAP: LazyLock<HashMap<&str, usize>> =
-    LazyLock::new(|| {
-        HashMap::from_iter(BOOKS.iter().enumerate().map(|x| (*x.1, x.0 + 1)))});
+    LazyLock::new(|| HashMap::from_iter(BOOKS.iter().enumerate().map(|x| (*x.1, x.0 + 1))));

-// find_files in base_dir that end with extension bible version.
+/// Find files in base_dir whose names end with the given Bible version.
 fn find_files(base_dir: &str, version: &str) -> Vec<String> {
     let paths = std::fs::read_dir(base_dir).unwrap();
     let mut result = Vec::<String>::new();
@@ -95,14 +97,14 @@ fn find_files(base_dir: &str, version: &str) -> Vec<String> {
         }
     }

-    let sorter_helper = |x:&String| -> (usize,i32) {
-        let v : Vec<&str> = x.split(".").collect();
-        let mut b:usize = 0;
+    let sorter_helper = |x: &String| -> (usize, i32) {
+        let v: Vec<&str> = x.split(".").collect();
+        let mut b: usize = 0;
         if BOOK_MAP.contains_key(v[0]) {
             b = BOOK_MAP[v[0]];
         }
-        let c:i32 = v[1].parse().unwrap_or(0);
-        (b,c)
+        let c: i32 = v[1].parse().unwrap_or(0);
+        (b, c)
     };

     // 1. Make it work. 2. Make it fast.
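The hunk is cut off before sorter_helper is consumed; presumably find_files ends by sorting the collected names with it, along these lines (a sketch, assuming sort_by_key over the (book, chapter) key):

    // Sort filenames into canonical (book, chapter) order before returning.
    result.sort_by_key(sorter_helper);
    result
}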
@@ -136,37 +138,7 @@ fn find_files(base_dir: &str, version: &str) -> Vec<String> {

 // https://www.bible.com/verse-of-the-day

-struct FetchResult {
-    cached: bool,
-    html: String,
-}
-
-fn fetch_cache(work_dir: &str, client: &reqwest::blocking::Client, url: &str) -> FetchResult {
-    let (_, filename) = url.rsplit_once('/').unwrap();
-    let path = Path::new(work_dir).join(filename);
-
-    if path.exists() {
-        // File already exists -- use cached version.
-        let buffer = std::fs::read_to_string(path).unwrap();
-        return FetchResult {
-            cached: true,
-            html: buffer,
-        };
-    }
-
-    println!("fetch_cache {} => {}", url, filename);
-    let res = client.get(url).send().unwrap();
-    let buffer = res.text().unwrap();
-
-    let mut file = File::create(path).unwrap();
-    let _ = file.write_all(buffer.as_bytes());
-    FetchResult {
-        cached: false,
-        html: buffer,
-    }
-}
-
-fn main() {
+fn main() -> Result<()> {
     let cli = Cli::parse();
     // println!("Work Dir: {:?}", cli.work);
     // println!("Bible: {:?}", cli.bible);
@@ -179,23 +151,45 @@ fn main() {
         for (name, _) in VERSION_URLS.iter() {
             println!(" {}", name);
         }
-        return;
+        return Ok(());
     }

     match &cli.command {
         Some(Commands::Fetch { delay }) => {
             let client = reqwest::blocking::Client::builder()
                 .user_agent(APP_USER_AGENT)
-                .build()
-                .unwrap();
+                .build()?;
+            // .unwrap();
             let mut url = VERSION_URLS[cli.version.as_str()].to_string();
             println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
             let mut more = true;

             while more {
-                let result = fetch_cache(cli.work.as_os_str().to_str().unwrap(), &client, url.as_str());
-                more = false;
+                let result = fetch::fetch_cache(
+                    cli.work
+                        .as_os_str()
+                        .to_str()
+                        .expect("Work should be valid."),
+                    &client,
+                    url.as_str(),
+                )?;
+
+                let next_chapter = parse::find_next_chapter(&result.html);
+
+                if let Ok(next_url) = next_chapter {
+                    // Ok! We have something
+                    // more = true;
+
+                    if next_url.starts_with("/") {
+                        url = String::from(BASE_URL) + &next_url;
+                    } else {
+                        url = next_url.to_string();
+                    }
+                } else {
+                    more = false;
+                }

+                /*
                 let document = scraper::Html::parse_document(&result.html);

                 // TO FIX
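parse::find_next_chapter is also new in this diff. A minimal sketch of src/parse.rs, assuming it re-implements the "Next Chapter" link scan from the commented-out crawl loop at the bottom of this file; the function name and the Result<String> shape come from the call site above, the body is otherwise an assumption:

use anyhow::{anyhow, Result};

/// Scan a chapter page for the "Next Chapter" link and return its href.
pub fn find_next_chapter(html: &str) -> Result<String> {
    let document = scraper::Html::parse_document(html);
    let a_selector = scraper::Selector::parse("div>a").unwrap();

    for a in document.select(&a_selector) {
        // The "Next Chapter" anchor carries no class attribute.
        if a.attr("class").is_some() {
            continue;
        }
        if let Some(href) = a.attr("href") {
            if href.contains("/bible/") {
                let text = a.text().collect::<Vec<_>>();
                if text.len() == 1 && text[0] == "Next Chapter" {
                    return Ok(href.to_string());
                }
            }
        }
    }
    Err(anyhow!("no Next Chapter link found"))
}

The caller prepends BASE_URL (presumably a constant for "https://www.bible.com", matching the hard-coded string in the old loop) when the returned href is site-relative.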
@@ -233,6 +227,7 @@ fn main() {
                         }
                     }
                 }
+                */

                 if more {
                     if !result.cached {
@@ -252,7 +247,7 @@ fn main() {

             let mut extractor = |file| {
                 println!("File: {}", file);
-            /*
+                /*
                 let mut filepath = cli.work.clone();
                 filepath = filepath.join(file);
                 */
@@ -406,142 +401,144 @@ fn main() {
         }
     }

-    /*
-    return;
+    Ok(())
+}

+/*
+return;

-    let client = reqwest::blocking::Client::builder()
-        .user_agent(APP_USER_AGENT)
-        .build()
-        .unwrap();

-    if true {
-        let mut url = String::from(URL);
-        let mut previous_url = url.clone();
-        let mut working = true;
+let client = reqwest::blocking::Client::builder()
+    .user_agent(APP_USER_AGENT)
+    .build()
+    .unwrap();

-        while working {
-            // Begin the fetching process...
-            let result = fetch_cache(&client, url.as_str());
+if true {
+    let mut url = String::from(URL);
+    let mut previous_url = url.clone();
+    let mut working = true;

-            working = false;
+    while working {
+        // Begin the fetching process...
+        let result = fetch_cache(&client, url.as_str());

-            // Ok, HTML, get parsing!
-            let document = scraper::Html::parse_document(&result.html);
+        working = false;

-            // For now, I don't care about parsing, just crawling/saving.
-            // Locate the a href.
-            let a_selector = scraper::Selector::parse("div>a").unwrap();
-            for a in document.select(&a_selector) {
-                let c = a.attr("class");
-                if c.is_some() {
-                    continue;
-                }
+        // Ok, HTML, get parsing!
+        let document = scraper::Html::parse_document(&result.html);

-                let href = a.attr("href");
+        // For now, I don't care about parsing, just crawling/saving.
+        // Locate the a href.
+        let a_selector = scraper::Selector::parse("div>a").unwrap();
+        for a in document.select(&a_selector) {
+            let c = a.attr("class");
+            if c.is_some() {
+                continue;
+            }

-                if href.is_some() {
-                    let href = href.unwrap();
-                    if href.contains("/bible/") {
-                        let text = a.text().collect::<Vec<_>>();
+            let href = a.attr("href");

-                        if text.len() != 1 {
-                            continue;
-                        }
-                        if text[0] != "Next Chapter" {
-                            continue;
-                        }
+            if href.is_some() {
+                let href = href.unwrap();
+                if href.contains("/bible/") {
+                    let text = a.text().collect::<Vec<_>>();

-                        // previous_url = url;
-                        url = href.to_string();
-                        if url.starts_with("/") {
-                            url = String::from("https://www.bible.com") + &url;
-                        }
+                    if text.len() != 1 {
+                        continue;
+                    }
+                    if text[0] != "Next Chapter" {
+                        continue;
+                    }

-                        working = true;
-                        break;
+                    // previous_url = url;
+                    url = href.to_string();
+                    if url.starts_with("/") {
+                        url = String::from("https://www.bible.com") + &url;
                     }
+
+                    working = true;
+                    break;
                 }
             }
-        if working {
-            if !result.cached {
-                // Don't sleep if the results we just got were from the cache.
-                thread::sleep(Duration::from_secs(10));
-            }
+        }
+        if working {
+            if !result.cached {
+                // Don't sleep if the results we just got were from the cache.
+                thread::sleep(Duration::from_secs(10));
             }
         }
-        println!("I'm finished.");
-    } else {
-        // let res = client.get("https://httpbin.org/anything").send().unwrap();
-        // println!("anything: {}", res.text().unwrap());
-
-        let mut file = File::open(Path::new("fetch1.html")).unwrap();
-        let mut buffer = String::new();
-        let _ = file.read_to_string(&mut buffer);
-        drop(file);
-
-        /*
-        let res = client.get(URL).send().unwrap();
-        let buffer = res.text().unwrap();
-        println!("{}", res.text().unwrap());
-        */
-
-        let document = scraper::Html::parse_document(&buffer);
-        let h1_selector = scraper::Selector::parse("h1").unwrap();
-
-        let h1 = document.select(&h1_selector).next().unwrap();
-        println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
-
-        // Selectors can't match on "if it exists" or
-        // return a matching value, type fields.
-
-        let span_selector = scraper::Selector::parse("span").unwrap();
-        for span in document.select(&span_selector) {
-            let data = span.attr("data-usfm");
-
-            if data.is_some() {
-                // Ok, iterate over these items...
-                for data_span in span.select(&span_selector) {
-                    let d = data_span.attr("class");
-                    // println!("data_span: {:?}", data_span);
-                    if d.is_some() {
-                        let dt = d.unwrap();
-                        if dt.contains("content") {
-                            let text = data_span.text().collect::<Vec<_>>();
-                            println!("{} {:?}", data.unwrap(), text)
-                        }
+    }
+    println!("I'm finished.");
+} else {
+    // let res = client.get("https://httpbin.org/anything").send().unwrap();
+    // println!("anything: {}", res.text().unwrap());
+
+    let mut file = File::open(Path::new("fetch1.html")).unwrap();
+    let mut buffer = String::new();
+    let _ = file.read_to_string(&mut buffer);
+    drop(file);
+
+    /*
+    let res = client.get(URL).send().unwrap();
+    let buffer = res.text().unwrap();
+    println!("{}", res.text().unwrap());
+    */
+
+    let document = scraper::Html::parse_document(&buffer);
+    let h1_selector = scraper::Selector::parse("h1").unwrap();
+
+    let h1 = document.select(&h1_selector).next().unwrap();
+    println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
+
+    // Selectors can't match on "if it exists" or
+    // return a matching value, type fields.
+
+    let span_selector = scraper::Selector::parse("span").unwrap();
+    for span in document.select(&span_selector) {
+        let data = span.attr("data-usfm");
+
+        if data.is_some() {
+            // Ok, iterate over these items...
+            for data_span in span.select(&span_selector) {
+                let d = data_span.attr("class");
+                // println!("data_span: {:?}", data_span);
+                if d.is_some() {
+                    let dt = d.unwrap();
+                    if dt.contains("content") {
+                        let text = data_span.text().collect::<Vec<_>>();
+                        println!("{} {:?}", data.unwrap(), text)
                     }
                 }
+            }

-                    /*
-                    GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
-                    GEN.1.25 [" "]
-                    GEN.1.26 ["Then God said, "]
-                    GEN.1.26 ["“Let us make man"]
-                    GEN.1.26 [" in our image, "]
-                    GEN.1.26 ["after our likeness. And "]
-                    GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
-                    GEN.1.26 [" "]
-                    GEN.1.27 ["So God created man in his own image,"]
-                    GEN.1.27 ["in the image of God he created him;"]
-                    GEN.1.27 [" "]
-                    GEN.1.27 ["male and female he created them."]
-                    GEN.1.27 [" "]
-                    */
+            /*
+            GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
+            GEN.1.25 [" "]
+            GEN.1.26 ["Then God said, "]
+            GEN.1.26 ["“Let us make man"]
+            GEN.1.26 [" in our image, "]
+            GEN.1.26 ["after our likeness. And "]
+            GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
+            GEN.1.26 [" "]
+            GEN.1.27 ["So God created man in his own image,"]
+            GEN.1.27 ["in the image of God he created him;"]
+            GEN.1.27 [" "]
+            GEN.1.27 ["male and female he created them."]
+            GEN.1.27 [" "]
+            */

-                    /*
-                    let mut text = span.text().collect::<Vec<_>>();
+            /*
+            let mut text = span.text().collect::<Vec<_>>();

-                    println!("span: {} {:?}", data.unwrap(), text);
-                    */
-                }
+            println!("span: {} {:?}", data.unwrap(), text);
+            */
         }
     }
-
-    // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
-    */
 }

+// let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
+*/
+
 /*
 Book/chapter: <h1>
 <h1>Genesis 1</h1>