// main.rs
  1. use anyhow::Result; // , Context};
  2. use clap::{Parser, Subcommand};
  3. use reqwest;
  4. use scraper;
  5. use std::{
  6. collections::HashMap,
  7. path::{Path, PathBuf},
  8. string::String,
  9. sync::LazyLock,
  10. };
  11. use std::{thread, time::Duration};
  12. mod fetch;
  13. mod parse;
// Setup the command line options.
// The `///` comments on the fields are left untouched on purpose: clap's derive
// turns doc comments into the help text shown to the user.
#[derive(Parser)]
#[command(about = "Downloads and parses Bible verses from https://www.bible.com", long_about=None, arg_required_else_help = true, after_help = "This is very specific to the website's HTML.\nIf it changes, this program might no longer work.")]
struct Cli {
    // Directory handed to the fetch cache and scanned by `extract`/`test`.
    /// Working directory
    #[arg(short, long, default_value = "bible")]
    work: PathBuf,
    // Must be one of the keys of VERSION_URLS; `main` rejects anything else.
    /// Bible Version
    #[arg(short, long, default_value = "ESV")]
    version: String,
    // Optional so that `main` can print its own message when no subcommand is given.
    #[command(subcommand)]
    command: Option<Commands>,
}
// Subcommands understood by the program; `main` dispatches on these.
// The `///` comments are left untouched on purpose: clap's derive turns doc
// comments into the help text shown to the user.
#[derive(Subcommand)]
enum Commands {
    /// Fetch from the web, using work directory for cache
    Fetch {
        // Seconds to sleep after each page that did not come from the cache.
        /// Delay
        #[arg(short, long, default_value = "10")]
        delay: u32,
    },
    /// Extract information from cached files
    Extract {
        // Number of cached files to process when `--all` is not given.
        /// Count
        #[arg(short, long, default_value = "5")]
        count: u32,
        // Process every matching cached file and ignore `count`.
        /// All
        #[arg(short, long, action=clap::ArgAction::SetTrue)]
        all: bool,
    },
    /// Verse of the day
    Verse {
        // NOTE(review): `main` currently ignores this flag (`fetch: _`).
        /// Fetch new version
        #[arg(short, long, action=clap::ArgAction::SetTrue)]
        fetch: bool,
    },
    // Scratch command: `main` reads the cached file GEN.1.NIV and dumps its verse spans.
    /// Test something out
    Test {},
}
/// User-Agent string configured on the HTTP clients built in `main`.
static APP_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
// Not needed, I process relative URLs correctly now.
// static BASE_URL: &str = "https://www.bible.com";
/// Page requested by the `verse` subcommand.
static VOD_URL: &str = "https://www.bible.com/verse-of-the-day";
  58. static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
  59. HashMap::from([
  60. ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
  61. ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
  62. ("NIV", "https://www.bible.com/bible/111/GEN.INTRO1.NIV"),
  63. ("NKJV", "https://www.bible.com/bible/114/GEN.1.NKJV"),
  64. // https://www.bible.com/bible/111/GEN.1.NIV"),
  65. ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
  66. ])
  67. });
  68. static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
  69. Vec::from([
  70. "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH",
  71. "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER", "LAM", "EZK",
  72. "DAN", "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  73. "MAT", "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL", "1TH",
  74. "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
  75. "REV",
  76. ])
  77. });
  78. static BOOK_MAP: LazyLock<HashMap<&str, usize>> =
  79. LazyLock::new(|| HashMap::from_iter(BOOKS.iter().enumerate().map(|x| (*x.1, x.0 + 1))));
  80. /// find_files in base_dir that end with extension bible version.
  81. fn find_files(base_dir: &str, version: &str) -> Vec<String> {
  82. let paths = std::fs::read_dir(base_dir).unwrap();
  83. let mut result = Vec::<String>::new();
  84. for path in paths {
  85. if let Ok(dir) = path {
  86. let filename = dir.file_name().to_string_lossy().to_string();
  87. if filename.ends_with(version) {
  88. result.push(filename);
  89. // result.push(dir.path().to_string_lossy().to_string());
  90. }
  91. }
  92. }
  93. let sorter_helper = |x: &String| -> (usize, i32) {
  94. let v: Vec<&str> = x.split(".").collect();
  95. let mut b: usize = 0;
  96. if BOOK_MAP.contains_key(v[0]) {
  97. b = BOOK_MAP[v[0]];
  98. }
  99. let c: i32 = v[1].parse().unwrap_or(0);
  100. (b, c)
  101. };
  102. // 1. Make it work. 2. Make it fast.
  103. // It would be nice to sort these (by book and chapter), so they are in order.
  104. // Should I just return file_names instead of path?
  105. result.sort_by(|a, b| {
  106. let a_v = sorter_helper(a);
  107. let b_v = sorter_helper(b);
  108. a_v.cmp(&b_v)
  109. });
  110. result
  111. }
// Reference URLs (kept as notes).
// Original starting point:
// static URL: &str = "https://www.bible.com/bible/59/PSA.95.ESV";
// First chapter of several versions:
// https://www.bible.com/bible/59/GEN.1.ESV
// https://www.bible.com/bible/1/GEN.1.KJV
// https://www.bible.com/bible/2692/GEN.1.NASB2020
// https://www.bible.com/bible/111/GEN.1.NIV
// https://www.bible.com/bible/821/GEN.1.YLT98
// Catholic:
// https://www.bible.com/bible/42/GEN.1.CPDV
// Audio (the page embeds a JSON-LD block whose "contentUrl" points at the MP3):
// https://www.bible.com/audio-bible/59/GEN.1.ESV
// <script type="application/ld+json">{"@context":"https://schema.org","@type":"AudioObject","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.bible.com/audio-bible/59/GEN.1.ESV"},"headline":"Audio Bible: Listen to Genesis 1 English Standard Version 2016 ESV","contentUrl":"https://audio-bible-cdn.youversionapi.com/1/32k/GEN/1-9dcefc68c6f7244489f59475fc7a1866.mp3?version_id=59",
// Verse of the day:
// https://www.bible.com/verse-of-the-day
  127. fn main() -> Result<()> {
  128. let cli = Cli::parse();
  129. // println!("Work Dir: {:?}", cli.work);
  130. // println!("Bible: {:?}", cli.bible);
  131. if !VERSION_URLS.contains_key(cli.version.as_str()) {
  132. println!("Sorry, I don't know about Bible Version [{}].", cli.version);
  133. println!("I do know about the following:");
  134. // Keys sorted in order.
  135. for (name, _) in VERSION_URLS.iter() {
  136. println!(" {}", name);
  137. }
  138. return Ok(());
  139. }
  140. match &cli.command {
  141. Some(Commands::Fetch { delay }) => {
  142. let client = reqwest::blocking::Client::builder()
  143. .user_agent(APP_USER_AGENT)
  144. .build()?;
  145. // .unwrap();
  146. let mut url = VERSION_URLS[cli.version.as_str()].to_string();
  147. println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
  148. let mut more = true;
  149. let mut cache_hit_once = true;
  150. while more {
  151. let result = fetch::fetch_cache(
  152. cli.work
  153. .as_os_str()
  154. .to_str()
  155. .expect("Work should be valid."),
  156. &client,
  157. url.as_str(),
  158. )?;
  159. let next_chapter = parse::find_next_chapter(&result.html, &url);
  160. if let Ok(next_url) = next_chapter {
  161. // Ok! We have something
  162. // more = true;
  163. /*
  164. if next_url.starts_with("/") {
  165. url = String::from(BASE_URL) + &next_url;
  166. } else {
  167. url = next_url.to_string();
  168. }
  169. */
  170. url = next_url;
  171. } else {
  172. // We didn't find the Next Chapter link, so stop.
  173. more = false;
  174. }
  175. // If there's more to do, add a delay between requests.
  176. if more {
  177. if !result.cached {
  178. thread::sleep(Duration::from_secs(*delay as u64));
  179. } else {
  180. if cache_hit_once {
  181. // Display this message only once.
  182. println!("Using CACHE.");
  183. cache_hit_once = false;
  184. }
  185. }
  186. }
  187. }
  188. println!("I'm finished fetching!");
  189. }
  190. Some(Commands::Extract { count, all }) => {
  191. println!("Extract...");
  192. let files = find_files(cli.work.to_str().unwrap(), cli.version.as_str());
  193. let filepath = Path::new(&cli.work);
  194. let mut chapters: HashMap<String, String> = HashMap::<String, String>::new();
  195. let mut extractor = |file| {
  196. println!("File: {}", file);
  197. /*
  198. let mut filepath = cli.work.clone();
  199. filepath = filepath.join(file);
  200. */
  201. let buffer = std::fs::read_to_string(filepath.join(file)).unwrap();
  202. let document = scraper::Html::parse_document(&buffer);
  203. let h1_selector = scraper::Selector::parse("h1").unwrap();
  204. let h1 = document.select(&h1_selector).next().unwrap();
  205. println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
  206. // https://programmersportal.com/the-complete-css-selectors-cheat-sheet-with-examples-and-pdf/
  207. // let span_selector = scraper::Selector::parse("span").unwrap();
  208. let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
  209. // parse r#"div>a[href ^="/bible/"]"#
  210. let span_class = scraper::Selector::parse("span[class]").unwrap();
  211. // span[class="ChapterContent_content__RrUqA"]
  212. // let span_class_content = scraper::Selector::parse(r#"span[class~="content"]"#).unwrap();
  213. // OK! ~= probably locates a matching attr line <span class="this that content"> but does not
  214. // match <span class="contains_content">!
  215. let _span_class_content =
  216. scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#)
  217. .unwrap();
  218. for span in document.select(&span_data_usfm) {
  219. // This will always be successful.
  220. if let Some(data) = span.attr("data-usfm") {
  221. // There can be multples of these with matching values.
  222. println!("data-usfm {}:", data);
  223. let lines: String = span
  224. .select(&span_class)
  225. // Only allow elements with attr class that containts "content"
  226. .filter(|x| {
  227. if let Some(c) = x.attr("class") {
  228. if c.contains("content") {
  229. return true;
  230. }
  231. }
  232. false
  233. })
  234. .map(|x| {
  235. // Convert element's text() iterator into a string.
  236. let init = String::new();
  237. let j = x.text().fold(init, |acc, x| {
  238. // print!( ">> {}<< ", x);
  239. let mut s = acc;
  240. if x == " " {
  241. // This would be a break/newline.
  242. s.push_str("\n");
  243. } else {
  244. s.push_str(x);
  245. }
  246. s
  247. });
  248. // println!("j = {}", j);
  249. j
  250. })
  251. .collect();
  252. println!("data {} lines {}", data, lines);
  253. if chapters.contains_key(data) {
  254. chapters.get_mut(data).unwrap().push_str(&lines);
  255. } else {
  256. chapters.insert(data.to_string(), lines);
  257. }
  258. }
  259. }
  260. };
  261. if *all {
  262. println!("Extract All:");
  263. for file in files.iter() {
  264. extractor(file);
  265. }
  266. } else {
  267. println!("Extract {}:", *count);
  268. for file in files.iter().take(*count as usize) {
  269. extractor(file);
  270. }
  271. }
  272. println!("Chapters: {:?}", chapters);
  273. /*
  274. "AMO.8.9": "“And on that day,” declares the Lord God,\n“I will make the sun go down at noonand darken the earth in broad daylight.\n"}
  275. ^ noonand ? Shouldn't that be "noon and"? Check original. Original has a break between them. Check merge routine.
  276. */
  277. /* ESV
  278. JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
  279. JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
  280. JHN.8.11 ["sin no more.”"]
  281. JHN.8.11 ["]]"] <- What is this? It is the the original HTML.
  282. JHN.8.11 [" "]
  283. */
  284. }
  285. Some(Commands::Verse { fetch: _ }) => {
  286. let client = reqwest::blocking::Client::builder()
  287. .user_agent(APP_USER_AGENT)
  288. .build()?;
  289. println!("Verse of the day.");
  290. let result = fetch::fetch_cache(
  291. cli.work
  292. .as_os_str()
  293. .to_str()
  294. .expect("Work should be valid."),
  295. &client,
  296. VOD_URL,
  297. )?;
  298. if result.cached {
  299. println!("(from cache):");
  300. }
  301. for v in parse::find_vod(&result.html)? {
  302. println!("Date: {}", v.date);
  303. println!("Verse: {}", v.verse);
  304. println!("Ref: {}", v.reference);
  305. println!("------");
  306. };
  307. }
  308. Some(Commands::Test {}) => {
  309. println!("Testing...");
  310. let path = Path::new(&cli.work).join("GEN.1.NIV");
  311. let buffer = std::fs::read_to_string(path).unwrap();
  312. let document = scraper::Html::parse_document(&buffer);
  313. let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
  314. let _span_class = scraper::Selector::parse("span[class]").unwrap();
  315. let span_selector = scraper::Selector::parse("span").unwrap();
  316. for span in document.select(&span_data_usfm) {
  317. if let Some(data) = span.attr("data-usfm") {
  318. println!("data-usfm {}:", data);
  319. let mut lines = Vec::<&str>::new();
  320. for data_span in span.select(&span_selector) {
  321. if let Some(data_class) = data_span.attr("class") {
  322. if data_class.contains("content") {
  323. let mut text = data_span.text().collect::<Vec<_>>();
  324. println!("{} {:?}", data, text);
  325. lines.append(&mut text);
  326. }
  327. }
  328. }
  329. println!("data {} lines {:?}", data, lines);
  330. }
  331. }
  332. /*
  333. // Test finding div>a[href^="/bible/"]
  334. // let a_selector = scraper::Selector::parse("div>a").unwrap();
  335. let a_selector = scraper::Selector::parse(r#"div>a[href ^="/bible/"]"#).unwrap();
  336. // This reduces down the number of items to check (from ~40 to 4)!
  337. // And invalidates the check for /bible/ !
  338. for a in document.select(&a_selector) {
  339. let text = a.text().collect::<Vec<_>>();
  340. println!("text: {:?}", text);
  341. if let Some(href) = a.attr("href") {
  342. println!("href = {}", href);
  343. }
  344. println!("=====");
  345. }
  346. */
  347. }
  348. None => {
  349. println!("I didn't see a command. Displaying help.\n");
  350. let _show_help: Cli = Cli::parse_from(["--help"]);
  351. }
  352. }
  353. Ok(())
  354. }
  355. /*
  356. return;
  357. let client = reqwest::blocking::Client::builder()
  358. .user_agent(APP_USER_AGENT)
  359. .build()
  360. .unwrap();
  361. if true {
  362. let mut url = String::from(URL);
  363. let mut previous_url = url.clone();
  364. let mut working = true;
  365. while working {
  366. // Begin the fetching process...
  367. let result = fetch_cache(&client, url.as_str());
  368. working = false;
  369. // Ok, HTML, get parsing!
  370. let document = scraper::Html::parse_document(&result.html);
  371. // For now, I don't care about parsing, just crawling/saving.
  372. // Locate the a href.
  373. let a_selector = scraper::Selector::parse("div>a").unwrap();
  374. for a in document.select(&a_selector) {
  375. let c = a.attr("class");
  376. if c.is_some() {
  377. continue;
  378. }
  379. let href = a.attr("href");
  380. if href.is_some() {
  381. let href = href.unwrap();
  382. if href.contains("/bible/") {
  383. let text = a.text().collect::<Vec<_>>();
  384. if text.len() != 1 {
  385. continue;
  386. }
  387. if text[0] != "Next Chapter" {
  388. continue;
  389. }
  390. // previous_url = url;
  391. url = href.to_string();
  392. if url.starts_with("/") {
  393. url = String::from("https://www.bible.com") + &url;
  394. }
  395. working = true;
  396. break;
  397. }
  398. }
  399. }
  400. if working {
  401. if !result.cached {
  402. // Don't sleep if the results we just got were from the cache.
  403. thread::sleep(Duration::from_secs(10));
  404. }
  405. }
  406. }
  407. println!("I'm finished.");
  408. } else {
  409. // let res = client.get("https://httpbin.org/anything").send().unwrap();
  410. // println!("anything: {}", res.text().unwrap());
  411. let mut file = File::open(Path::new("fetch1.html")).unwrap();
  412. let mut buffer = String::new();
  413. let _ = file.read_to_string(&mut buffer);
  414. drop(file);
  415. /*
  416. let res = client.get(URL).send().unwrap();
  417. let buffer = res.text().unwrap();
  418. println!("{}", res.text().unwrap());
  419. */
  420. let document = scraper::Html::parse_document(&buffer);
  421. let h1_selector = scraper::Selector::parse("h1").unwrap();
  422. let h1 = document.select(&h1_selector).next().unwrap();
  423. println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
  424. // Selectors can't match on "if it exists" or
  425. // return a matching value, type fields.
  426. let span_selector = scraper::Selector::parse("span").unwrap();
  427. for span in document.select(&span_selector) {
  428. let data = span.attr("data-usfm");
  429. if data.is_some() {
  430. // Ok, iterate over these items...
  431. for data_span in span.select(&span_selector) {
  432. let d = data_span.attr("class");
  433. // println!("data_span: {:?}", data_span);
  434. if d.is_some() {
  435. let dt = d.unwrap();
  436. if dt.contains("content") {
  437. let text = data_span.text().collect::<Vec<_>>();
  438. println!("{} {:?}", data.unwrap(), text)
  439. }
  440. }
  441. }
  442. /*
  443. GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
  444. GEN.1.25 [" "]
  445. GEN.1.26 ["Then God said, "]
  446. GEN.1.26 ["“Let us make man"]
  447. GEN.1.26 [" in our image, "]
  448. GEN.1.26 ["after our likeness. And "]
  449. GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
  450. GEN.1.26 [" "]
  451. GEN.1.27 ["So God created man in his own image,"]
  452. GEN.1.27 ["in the image of God he created him;"]
  453. GEN.1.27 [" "]
  454. GEN.1.27 ["male and female he created them."]
  455. GEN.1.27 [" "]
  456. */
  457. /*
  458. let mut text = span.text().collect::<Vec<_>>();
  459. println!("span: {} {:?}", data.unwrap(), text);
  460. */
  461. }
  462. }
  463. }
  464. // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
  465. */
  466. /*
  467. Book/chapter: <h1>
  468. <h1>Genesis 1</h1>
  469. <div data-usfm="">, <span data-usfm="">
  470. <div data-usfm="GEN.1" class=
  471. "ChapterContent_chapter__uvbXo">
  472. <div class="ChapterContent_label__R2PLt">
  473. 1
  474. </div>
  475. <div class="ChapterContent_s1__bNNaW">
  476. <span class=
  477. "ChapterContent_heading__xBDcs">The
  478. Creation of the World</span>
  479. </div>
  480. <div class="ChapterContent_p__dVKHb">
  481. <span data-usfm="GEN.1.1" class=
  482. "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
  483. 1</span><span class=
  484. "ChapterContent_content__RrUqA">In
  485. the</span> <span class=
  486. "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
  487. <span class=
  488. "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Job
  489. 38:4-7; Ps. 33:6; 136:5; Isa. 42:5;
  490. 45:18; John 1:1-3; Acts 14:15; 17:24;
  491. Col. 1:16, 17; Heb. 1:10; 11:3; Rev.
  492. 4:11</span></span><span class=
  493. "ChapterContent_content__RrUqA">beginning,
  494. God created the heavens and the
  495. earth.</span></span>
  496. <span data-usfm=
  497. "GEN.1.2" class=
  498. "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
  499. 2</span><span class=
  500. "ChapterContent_content__RrUqA">The
  501. earth was</span> <span class=
  502. "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
  503. <span class=
  504. "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Jer.
  505. 4:23</span></span><span class=
  506. "ChapterContent_content__RrUqA">without
  507. form and void, and darkness was over
  508. the face of the deep. And the Spirit of
  509. God was hovering over the face of the
  510. waters.</span></span>
  511. Next page link:
  512. <div class="[pointer-events:all]">
  513. <a href="/bible/59/GEN.2.ESV">
  514. */