123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599 |
- use clap::{Parser, Subcommand};
- use reqwest;
- use scraper;
- use std::{
- collections::HashMap,
- fs::File,
- io::Write,
- path::{Path, PathBuf},
- string::String,
- sync::LazyLock,
- };
- use std::{thread, time::Duration};
- // Setup the command line options
- #[derive(Parser)]
- #[command(about, long_about=None)]
- struct Cli {
- /// Working directory
- #[arg(short, long, default_value = "bible")]
- work: PathBuf,
- /// Bible Version
- #[arg(short, long, default_value = "ESV")]
- version: String,
- #[command(subcommand)]
- command: Option<Commands>,
- }
- #[derive(Subcommand)]
- enum Commands {
- /// Fetch from the web, using work directory for cache
- Fetch {
- /// Delay
- #[arg(short, long, default_value = "10")]
- delay: u32,
- },
- /// Extract information from cached files
- Extract {
- /// Count
- #[arg(short, long, default_value = "5")]
- count: u32,
- /// All
- #[arg(short, long, action=clap::ArgAction::SetTrue)]
- all: bool,
- },
- /// Test something out
- Test {},
- }
/// User-Agent header sent with every HTTP request.
static APP_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";

/// Site root, prepended to relative links found while crawling.
static BASE_URL: &str = "https://www.bible.com";

/// Starting URL for each known Bible version, keyed by version name.
static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
    let table = [
        ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
        ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
        // NIV starts at an introduction page rather than at
        // https://www.bible.com/bible/111/GEN.1.NIV
        ("NIV", "https://www.bible.com/bible/111/GEN.INTRO1.NIV"),
        ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
    ];
    table.iter().copied().collect()
});

/// Three-letter book codes, listed in canonical order.
static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
    vec![
        "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH",
        "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER", "LAM", "EZK",
        "DAN", "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
        "MAT", "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL", "1TH",
        "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
        "REV",
    ]
});

/// Book code -> 1-based position of that book within `BOOKS`.
static BOOK_MAP: LazyLock<HashMap<&str, usize>> = LazyLock::new(|| {
    BOOKS
        .iter()
        .enumerate()
        .map(|(index, code)| (*code, index + 1))
        .collect()
});
- // find_files in base_dir that end with extension bible version.
- fn find_files(base_dir: &str, version: &str) -> Vec<String> {
- let paths = std::fs::read_dir(base_dir).unwrap();
- let mut result = Vec::<String>::new();
- for path in paths {
- if let Ok(dir) = path {
- let filename = dir.file_name().to_string_lossy().to_string();
- if filename.ends_with(version) {
- result.push(filename);
- // result.push(dir.path().to_string_lossy().to_string());
- }
- }
- }
- let sorter_helper = |x:&String| -> (usize,i32) {
- let v : Vec<&str> = x.split(".").collect();
- let mut b:usize = 0;
- if BOOK_MAP.contains_key(v[0]) {
- b = BOOK_MAP[v[0]];
- }
- let c:i32 = v[1].parse().unwrap_or(0);
- (b,c)
- };
- // 1. Make it work. 2. Make it fast.
- // It would be nice to sort these (by book and chapter), so they are in order.
- // Should I just return file_names instead of path?
- result.sort_by(|a, b| {
- let a_v = sorter_helper(a);
- let b_v = sorter_helper(b);
- a_v.cmp(&b_v)
- });
- result
- }
- // Start here
- // static URL: &str = "https://www.bible.com/bible/59/PSA.95.ESV";
- // "https://www.bible.com/bible/59/GEN.1.ESV";
- // And maybe:
- // https://www.bible.com/bible/2692/GEN.1.NASB2020
- // https://www.bible.com/bible/1/GEN.1.KJV
- // https://www.bible.com/bible/2692/GEN.1.NASB2020
- // https://www.bible.com/bible/111/GEN.1.NIV
- // https://www.bible.com/bible/821/GEN.1.YLT98
- // Catholic
- // https://www.bible.com/bible/42/GEN.1.CPDV
- // Audio
- // https://www.bible.com/audio-bible/59/GEN.1.ESV
- // <script type="application/ld+json">{"@context":"https://schema.org","@type":"AudioObject","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.bible.com/audio-bible/59/GEN.1.ESV"},"headline":"Audio Bible: Listen to Genesis 1 English Standard Version 2016 ESV","contentUrl":"https://audio-bible-cdn.youversionapi.com/1/32k/GEN/1-9dcefc68c6f7244489f59475fc7a1866.mp3?version_id=59",
- // https://www.bible.com/verse-of-the-day
/// Outcome of a single `fetch_cache` call.
struct FetchResult {
    /// `true` when the page was read from the work directory instead of the network.
    cached: bool,
    /// Page contents.
    html: String,
}
- fn fetch_cache(work_dir: &str, client: &reqwest::blocking::Client, url: &str) -> FetchResult {
- let (_, filename) = url.rsplit_once('/').unwrap();
- let path = Path::new(work_dir).join(filename);
- if path.exists() {
- // File already exists -- use cached version.
- let buffer = std::fs::read_to_string(path).unwrap();
- return FetchResult {
- cached: true,
- html: buffer,
- };
- }
- println!("fetch_cache {} => {}", url, filename);
- let res = client.get(url).send().unwrap();
- let buffer = res.text().unwrap();
- let mut file = File::create(path).unwrap();
- let _ = file.write_all(buffer.as_bytes());
- FetchResult {
- cached: false,
- html: buffer,
- }
- }
- fn main() {
- let cli = Cli::parse();
- // println!("Work Dir: {:?}", cli.work);
- // println!("Bible: {:?}", cli.bible);
- if !VERSION_URLS.contains_key(cli.version.as_str()) {
- println!("Sorry, I don't know about Bible Version [{}].", cli.version);
- println!("I do know about the following:");
- // Keys sorted in order.
- for (name, _) in VERSION_URLS.iter() {
- println!(" {}", name);
- }
- return;
- }
- match &cli.command {
- Some(Commands::Fetch { delay }) => {
- let client = reqwest::blocking::Client::builder()
- .user_agent(APP_USER_AGENT)
- .build()
- .unwrap();
- let mut url = VERSION_URLS[cli.version.as_str()].to_string();
- println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
- let mut more = true;
- while more {
- let result = fetch_cache(cli.work.as_os_str().to_str().unwrap(), &client, url.as_str());
- more = false;
- let document = scraper::Html::parse_document(&result.html);
- // TO FIX
- // We want to upgrade this to use CSS selectors.
- // For now, us the "working" code we have.
- let a_selector = scraper::Selector::parse("div>a").unwrap();
- for a in document.select(&a_selector) {
- // Skip elements with a class attribute
- if a.attr("class").is_some() {
- continue;
- }
- if let Some(href) = a.attr("href") {
- if href.contains("/bible/") {
- let text = a.text().collect::<Vec<_>>();
- if text.len() != 1 {
- continue;
- }
- if text[0] != "Next Chapter" {
- continue;
- }
- // Ok! We've found the Next Chapter a element!
- if href.starts_with("/") {
- url = String::from(BASE_URL) + href;
- } else {
- url = href.to_string();
- }
- // println!("Found HREF: {} => {}", href, url);
- // panic!("Squirrel alert!");
- more = true;
- break;
- }
- }
- }
- if more {
- if !result.cached {
- thread::sleep(Duration::from_secs(*delay as u64));
- }
- }
- }
- println!("I'm finished fetching!");
- }
- Some(Commands::Extract { count, all }) => {
- println!("Extract...");
- let files = find_files(cli.work.to_str().unwrap(), cli.version.as_str());
- let filepath = Path::new(&cli.work);
- let mut chapters: HashMap<String, String> = HashMap::<String, String>::new();
- let mut extractor = |file| {
- println!("File: {}", file);
- /*
- let mut filepath = cli.work.clone();
- filepath = filepath.join(file);
- */
- let buffer = std::fs::read_to_string(filepath.join(file)).unwrap();
- let document = scraper::Html::parse_document(&buffer);
- let h1_selector = scraper::Selector::parse("h1").unwrap();
- let h1 = document.select(&h1_selector).next().unwrap();
- println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
- // https://programmersportal.com/the-complete-css-selectors-cheat-sheet-with-examples-and-pdf/
- // let span_selector = scraper::Selector::parse("span").unwrap();
- let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
- // parse r#"div>a[href ^="/bible/"]"#
- let span_class = scraper::Selector::parse("span[class]").unwrap();
- // span[class="ChapterContent_content__RrUqA"]
- // let span_class_content = scraper::Selector::parse(r#"span[class~="content"]"#).unwrap();
- // OK! ~= probably locates a matching attr line <span class="this that content"> but does not
- // match <span class="contains_content">!
- let _span_class_content =
- scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#)
- .unwrap();
- for span in document.select(&span_data_usfm) {
- // This will always be successful.
- if let Some(data) = span.attr("data-usfm") {
- // There can be multples of these with matching values.
- println!("data-usfm {}:", data);
- let lines: String = span
- .select(&span_class)
- // Only allow elements with attr class that containts "content"
- .filter(|x| {
- if let Some(c) = x.attr("class") {
- if c.contains("content") {
- return true;
- }
- }
- false
- })
- .map(|x| {
- // Convert element's text() iterator into a string.
- let init = String::new();
- let j = x.text().fold(init, |acc, x| {
- // print!( ">> {}<< ", x);
- let mut s = acc;
- if x == " " {
- // This would be a break/newline.
- s.push_str("\n");
- } else {
- s.push_str(x);
- }
- s
- });
- // println!("j = {}", j);
- j
- })
- .collect();
- println!("data {} lines {}", data, lines);
- if chapters.contains_key(data) {
- chapters.get_mut(data).unwrap().push_str(&lines);
- } else {
- chapters.insert(data.to_string(), lines);
- }
- }
- }
- };
- if *all {
- println!("Extract All:");
- for file in files.iter() {
- extractor(file);
- }
- } else {
- println!("Extract {}:", *count);
- for file in files.iter().take(*count as usize) {
- extractor(file);
- }
- }
- println!("Chapters: {:?}", chapters);
- /*
- "AMO.8.9": "“And on that day,” declares the Lord God,\n“I will make the sun go down at noonand darken the earth in broad daylight.\n"}
- ^ noonand ? Shouldn't that be "noon and"? Check original. Original has a break between them. Check merge routine.
- */
- /*
- // for file in files.iter().take(*count as usize) {
- for file in files_iter {
- /* ESV
- JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
- JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
- JHN.8.11 ["sin no more.”"]
- JHN.8.11 ["]]"] <- What is this? It is the the original HTML.
- JHN.8.11 [" "]
- */
- }
- */
- }
- Some(Commands::Test {}) => {
- println!("Testing...");
- let path = Path::new(&cli.work).join("GEN.1.NIV");
- let buffer = std::fs::read_to_string(path).unwrap();
- let document = scraper::Html::parse_document(&buffer);
- let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
- let _span_class = scraper::Selector::parse("span[class]").unwrap();
- let span_selector = scraper::Selector::parse("span").unwrap();
- for span in document.select(&span_data_usfm) {
- if let Some(data) = span.attr("data-usfm") {
- println!("data-usfm {}:", data);
- let mut lines = Vec::<&str>::new();
- for data_span in span.select(&span_selector) {
- if let Some(data_class) = data_span.attr("class") {
- if data_class.contains("content") {
- let mut text = data_span.text().collect::<Vec<_>>();
- println!("{} {:?}", data, text);
- lines.append(&mut text);
- }
- }
- }
- println!("data {} lines {:?}", data, lines);
- }
- }
- /*
- // Test finding div>a[href^="/bible/"]
- // let a_selector = scraper::Selector::parse("div>a").unwrap();
- let a_selector = scraper::Selector::parse(r#"div>a[href ^="/bible/"]"#).unwrap();
- // This reduces down the number of items to check (from ~40 to 4)!
- // And invalidates the check for /bible/ !
- for a in document.select(&a_selector) {
- let text = a.text().collect::<Vec<_>>();
- println!("text: {:?}", text);
- if let Some(href) = a.attr("href") {
- println!("href = {}", href);
- }
- println!("=====");
- }
- */
- }
- None => {
- println!("Looking for FETCH or EXTRACT");
- println!("I've got nothing to do here...");
- }
- }
- /*
- return;
- let client = reqwest::blocking::Client::builder()
- .user_agent(APP_USER_AGENT)
- .build()
- .unwrap();
- if true {
- let mut url = String::from(URL);
- let mut previous_url = url.clone();
- let mut working = true;
- while working {
- // Begin the fetching process...
- let result = fetch_cache(&client, url.as_str());
- working = false;
- // Ok, HTML, get parsing!
- let document = scraper::Html::parse_document(&result.html);
- // For now, I don't care about parsing, just crawling/saving.
- // Locate the a href.
- let a_selector = scraper::Selector::parse("div>a").unwrap();
- for a in document.select(&a_selector) {
- let c = a.attr("class");
- if c.is_some() {
- continue;
- }
- let href = a.attr("href");
- if href.is_some() {
- let href = href.unwrap();
- if href.contains("/bible/") {
- let text = a.text().collect::<Vec<_>>();
- if text.len() != 1 {
- continue;
- }
- if text[0] != "Next Chapter" {
- continue;
- }
- // previous_url = url;
- url = href.to_string();
- if url.starts_with("/") {
- url = String::from("https://www.bible.com") + &url;
- }
- working = true;
- break;
- }
- }
- }
- if working {
- if !result.cached {
- // Don't sleep if the results we just got were from the cache.
- thread::sleep(Duration::from_secs(10));
- }
- }
- }
- println!("I'm finished.");
- } else {
- // let res = client.get("https://httpbin.org/anything").send().unwrap();
- // println!("anything: {}", res.text().unwrap());
- let mut file = File::open(Path::new("fetch1.html")).unwrap();
- let mut buffer = String::new();
- let _ = file.read_to_string(&mut buffer);
- drop(file);
- /*
- let res = client.get(URL).send().unwrap();
- let buffer = res.text().unwrap();
- println!("{}", res.text().unwrap());
- */
- let document = scraper::Html::parse_document(&buffer);
- let h1_selector = scraper::Selector::parse("h1").unwrap();
- let h1 = document.select(&h1_selector).next().unwrap();
- println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
- // Selectors can't match on "if it exists" or
- // return a matching value, type fields.
- let span_selector = scraper::Selector::parse("span").unwrap();
- for span in document.select(&span_selector) {
- let data = span.attr("data-usfm");
- if data.is_some() {
- // Ok, iterate over these items...
- for data_span in span.select(&span_selector) {
- let d = data_span.attr("class");
- // println!("data_span: {:?}", data_span);
- if d.is_some() {
- let dt = d.unwrap();
- if dt.contains("content") {
- let text = data_span.text().collect::<Vec<_>>();
- println!("{} {:?}", data.unwrap(), text)
- }
- }
- }
- /*
- GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
- GEN.1.25 [" "]
- GEN.1.26 ["Then God said, "]
- GEN.1.26 ["“Let us make man"]
- GEN.1.26 [" in our image, "]
- GEN.1.26 ["after our likeness. And "]
- GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
- GEN.1.26 [" "]
- GEN.1.27 ["So God created man in his own image,"]
- GEN.1.27 ["in the image of God he created him;"]
- GEN.1.27 [" "]
- GEN.1.27 ["male and female he created them."]
- GEN.1.27 [" "]
- */
- /*
- let mut text = span.text().collect::<Vec<_>>();
- println!("span: {} {:?}", data.unwrap(), text);
- */
- }
- }
- }
- // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
- */
- }
- /*
- Book/chapter: <h1>
- <h1>Genesis 1</h1>
- <div data-usfm="">, <span data-usfm="">
- <div data-usfm="GEN.1" class=
- "ChapterContent_chapter__uvbXo">
- <div class="ChapterContent_label__R2PLt">
- 1
- </div>
- <div class="ChapterContent_s1__bNNaW">
- <span class=
- "ChapterContent_heading__xBDcs">The
- Creation of the World</span>
- </div>
- <div class="ChapterContent_p__dVKHb">
- <span data-usfm="GEN.1.1" class=
- "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
- 1</span><span class=
- "ChapterContent_content__RrUqA">In
- the</span> <span class=
- "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
- <span class=
- "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Job
- 38:4-7; Ps. 33:6; 136:5; Isa. 42:5;
- 45:18; John 1:1-3; Acts 14:15; 17:24;
- Col. 1:16, 17; Heb. 1:10; 11:3; Rev.
- 4:11</span></span><span class=
- "ChapterContent_content__RrUqA">beginning,
- God created the heavens and the
- earth.</span></span>
- <span data-usfm=
- "GEN.1.2" class=
- "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
- 2</span><span class=
- "ChapterContent_content__RrUqA">The
- earth was</span> <span class=
- "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
- <span class=
- "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Jer.
- 4:23</span></span><span class=
- "ChapterContent_content__RrUqA">without
- form and void, and darkness was over
- the face of the deep. And the Spirit of
- God was hovering over the face of the
- waters.</span></span>
- Next page link:
- <div class="[pointer-events:all]">
- <a href="/bible/59/GEN.2.ESV">
- */
|