main.rs 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. use clap::{Parser, Subcommand};
  2. use reqwest;
  3. use scraper;
  4. use std::{
  5. collections::HashMap,
  6. fs::File,
  7. io::Write,
  8. path::{Path, PathBuf},
  9. string::String,
  10. sync::LazyLock,
  11. };
  12. use std::{thread, time::Duration};
// Setup the command line options
#[derive(Parser)]
#[command(about, long_about=None)]
struct Cli {
    /// Working directory
    // Used both as the fetch cache directory and as the extract input.
    #[arg(short, long, default_value = "bible")]
    work: PathBuf,
    /// Bible Version
    // Must be one of the keys of VERSION_URLS (e.g. ESV, KJV, NIV, YLT98).
    #[arg(short, long, default_value = "ESV")]
    version: String,
    // Subcommand to run; when absent, main() just prints a hint and exits.
    #[command(subcommand)]
    command: Option<Commands>,
}
#[derive(Subcommand)]
enum Commands {
    /// Fetch from the web, using work directory for cache
    Fetch {
        /// Delay
        // Seconds to sleep between network fetches; cache hits skip the sleep.
        #[arg(short, long, default_value = "10")]
        delay: u32,
    },
    /// Extract information from cached files
    Extract {
        /// Count
        // Number of cached files to process when --all is not given.
        #[arg(short, long, default_value = "5")]
        count: u32,
        /// All
        // Process every cached file, ignoring --count.
        #[arg(short, long, action=clap::ArgAction::SetTrue)]
        all: bool,
    },
    /// Test something out
    Test {},
}
// Browser-style User-Agent string attached to every HTTP request
// (see the reqwest client builder in the Fetch arm of main()).
static APP_USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0";
// Site root, used to absolutize relative "Next Chapter" hrefs.
static BASE_URL: &str = "https://www.bible.com";
// Starting URL for each supported Bible version; the Fetch command begins
// at this page and follows "Next Chapter" links from there.
static VERSION_URLS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
    HashMap::from([
        ("ESV", "https://www.bible.com/bible/59/GEN.1.ESV"),
        ("KJV", "https://www.bible.com/bible/1/GEN.1.KJV"),
        // NOTE(review): NIV starts at the book intro page (INTRO1) rather
        // than chapter 1 -- presumably intentional so the intro is cached
        // too; confirm. The plain chapter URL would be:
        // https://www.bible.com/bible/111/GEN.1.NIV
        ("NIV", "https://www.bible.com/bible/111/GEN.INTRO1.NIV"),
        ("YLT98", "https://www.bible.com/bible/821/GEN.1.YLT98"),
    ])
});
  58. static BOOKS: LazyLock<Vec<&str>> = LazyLock::new(|| {
  59. Vec::from([
  60. "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH",
  61. "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER", "LAM", "EZK",
  62. "DAN", "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  63. "MAT", "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL", "1TH",
  64. "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
  65. "REV",
  66. ])
  67. });
  68. static BOOK_MAP: LazyLock<HashMap<&str, usize>> =
  69. LazyLock::new(|| {
  70. HashMap::from_iter(BOOKS.iter().enumerate().map(|x| (*x.1, x.0 + 1)))});
  71. // find_files in base_dir that end with extension bible version.
  72. fn find_files(base_dir: &str, version: &str) -> Vec<String> {
  73. let paths = std::fs::read_dir(base_dir).unwrap();
  74. let mut result = Vec::<String>::new();
  75. for path in paths {
  76. if let Ok(dir) = path {
  77. let filename = dir.file_name().to_string_lossy().to_string();
  78. if filename.ends_with(version) {
  79. result.push(filename);
  80. // result.push(dir.path().to_string_lossy().to_string());
  81. }
  82. }
  83. }
  84. let sorter_helper = |x:&String| -> (usize,i32) {
  85. let v : Vec<&str> = x.split(".").collect();
  86. let mut b:usize = 0;
  87. if BOOK_MAP.contains_key(v[0]) {
  88. b = BOOK_MAP[v[0]];
  89. }
  90. let c:i32 = v[1].parse().unwrap_or(0);
  91. (b,c)
  92. };
  93. // 1. Make it work. 2. Make it fast.
  94. // It would be nice to sort these (by book and chapter), so they are in order.
  95. // Should I just return file_names instead of path?
  96. result.sort_by(|a, b| {
  97. let a_v = sorter_helper(a);
  98. let b_v = sorter_helper(b);
  99. a_v.cmp(&b_v)
  100. });
  101. result
  102. }
  103. // Start here
  104. // static URL: &str = "https://www.bible.com/bible/59/PSA.95.ESV";
  105. // "https://www.bible.com/bible/59/GEN.1.ESV";
  106. // And maybe:
  107. // https://www.bible.com/bible/2692/GEN.1.NASB2020
  108. // https://www.bible.com/bible/1/GEN.1.KJV
  109. // https://www.bible.com/bible/2692/GEN.1.NASB2020
  110. // https://www.bible.com/bible/111/GEN.1.NIV
  111. // https://www.bible.com/bible/821/GEN.1.YLT98
  112. // Catholic
  113. // https://www.bible.com/bible/42/GEN.1.CPDV
  114. // Audio
  115. // https://www.bible.com/audio-bible/59/GEN.1.ESV
  116. // <script type="application/ld+json">{"@context":"https://schema.org","@type":"AudioObject","mainEntityOfPage":{"@type":"WebPage","@id":"https://www.bible.com/audio-bible/59/GEN.1.ESV"},"headline":"Audio Bible: Listen to Genesis 1 English Standard Version 2016 ESV","contentUrl":"https://audio-bible-cdn.youversionapi.com/1/32k/GEN/1-9dcefc68c6f7244489f59475fc7a1866.mp3?version_id=59",
  117. // https://www.bible.com/verse-of-the-day
// Result of fetch_cache: the page HTML plus whether it came from the
// on-disk cache (cached results let the caller skip the politeness delay).
struct FetchResult {
    cached: bool,
    html: String,
}
  122. fn fetch_cache(work_dir: &str, client: &reqwest::blocking::Client, url: &str) -> FetchResult {
  123. let (_, filename) = url.rsplit_once('/').unwrap();
  124. let path = Path::new(work_dir).join(filename);
  125. if path.exists() {
  126. // File already exists -- use cached version.
  127. let buffer = std::fs::read_to_string(path).unwrap();
  128. return FetchResult {
  129. cached: true,
  130. html: buffer,
  131. };
  132. }
  133. println!("fetch_cache {} => {}", url, filename);
  134. let res = client.get(url).send().unwrap();
  135. let buffer = res.text().unwrap();
  136. let mut file = File::create(path).unwrap();
  137. let _ = file.write_all(buffer.as_bytes());
  138. FetchResult {
  139. cached: false,
  140. html: buffer,
  141. }
  142. }
  143. fn main() {
  144. let cli = Cli::parse();
  145. // println!("Work Dir: {:?}", cli.work);
  146. // println!("Bible: {:?}", cli.bible);
  147. if !VERSION_URLS.contains_key(cli.version.as_str()) {
  148. println!("Sorry, I don't know about Bible Version [{}].", cli.version);
  149. println!("I do know about the following:");
  150. // Keys sorted in order.
  151. for (name, _) in VERSION_URLS.iter() {
  152. println!(" {}", name);
  153. }
  154. return;
  155. }
  156. match &cli.command {
  157. Some(Commands::Fetch { delay }) => {
  158. let client = reqwest::blocking::Client::builder()
  159. .user_agent(APP_USER_AGENT)
  160. .build()
  161. .unwrap();
  162. let mut url = VERSION_URLS[cli.version.as_str()].to_string();
  163. println!("Fetch! [{}] with delay {} secs.", cli.version, delay);
  164. let mut more = true;
  165. while more {
  166. let result = fetch_cache(cli.work.as_os_str().to_str().unwrap(), &client, url.as_str());
  167. more = false;
  168. let document = scraper::Html::parse_document(&result.html);
  169. // TO FIX
  170. // We want to upgrade this to use CSS selectors.
  171. // For now, us the "working" code we have.
  172. let a_selector = scraper::Selector::parse("div>a").unwrap();
  173. for a in document.select(&a_selector) {
  174. // Skip elements with a class attribute
  175. if a.attr("class").is_some() {
  176. continue;
  177. }
  178. if let Some(href) = a.attr("href") {
  179. if href.contains("/bible/") {
  180. let text = a.text().collect::<Vec<_>>();
  181. if text.len() != 1 {
  182. continue;
  183. }
  184. if text[0] != "Next Chapter" {
  185. continue;
  186. }
  187. // Ok! We've found the Next Chapter a element!
  188. if href.starts_with("/") {
  189. url = String::from(BASE_URL) + href;
  190. } else {
  191. url = href.to_string();
  192. }
  193. // println!("Found HREF: {} => {}", href, url);
  194. // panic!("Squirrel alert!");
  195. more = true;
  196. break;
  197. }
  198. }
  199. }
  200. if more {
  201. if !result.cached {
  202. thread::sleep(Duration::from_secs(*delay as u64));
  203. }
  204. }
  205. }
  206. println!("I'm finished fetching!");
  207. }
  208. Some(Commands::Extract { count, all }) => {
  209. println!("Extract...");
  210. let files = find_files(cli.work.to_str().unwrap(), cli.version.as_str());
  211. let filepath = Path::new(&cli.work);
  212. let mut chapters: HashMap<String, String> = HashMap::<String, String>::new();
  213. let mut extractor = |file| {
  214. println!("File: {}", file);
  215. /*
  216. let mut filepath = cli.work.clone();
  217. filepath = filepath.join(file);
  218. */
  219. let buffer = std::fs::read_to_string(filepath.join(file)).unwrap();
  220. let document = scraper::Html::parse_document(&buffer);
  221. let h1_selector = scraper::Selector::parse("h1").unwrap();
  222. let h1 = document.select(&h1_selector).next().unwrap();
  223. println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
  224. // https://programmersportal.com/the-complete-css-selectors-cheat-sheet-with-examples-and-pdf/
  225. // let span_selector = scraper::Selector::parse("span").unwrap();
  226. let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
  227. // parse r#"div>a[href ^="/bible/"]"#
  228. let span_class = scraper::Selector::parse("span[class]").unwrap();
  229. // span[class="ChapterContent_content__RrUqA"]
  230. // let span_class_content = scraper::Selector::parse(r#"span[class~="content"]"#).unwrap();
  231. // OK! ~= probably locates a matching attr line <span class="this that content"> but does not
  232. // match <span class="contains_content">!
  233. let _span_class_content =
  234. scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#)
  235. .unwrap();
  236. for span in document.select(&span_data_usfm) {
  237. // This will always be successful.
  238. if let Some(data) = span.attr("data-usfm") {
  239. // There can be multples of these with matching values.
  240. println!("data-usfm {}:", data);
  241. let lines: String = span
  242. .select(&span_class)
  243. // Only allow elements with attr class that containts "content"
  244. .filter(|x| {
  245. if let Some(c) = x.attr("class") {
  246. if c.contains("content") {
  247. return true;
  248. }
  249. }
  250. false
  251. })
  252. .map(|x| {
  253. // Convert element's text() iterator into a string.
  254. let init = String::new();
  255. let j = x.text().fold(init, |acc, x| {
  256. // print!( ">> {}<< ", x);
  257. let mut s = acc;
  258. if x == " " {
  259. // This would be a break/newline.
  260. s.push_str("\n");
  261. } else {
  262. s.push_str(x);
  263. }
  264. s
  265. });
  266. // println!("j = {}", j);
  267. j
  268. })
  269. .collect();
  270. println!("data {} lines {}", data, lines);
  271. if chapters.contains_key(data) {
  272. chapters.get_mut(data).unwrap().push_str(&lines);
  273. } else {
  274. chapters.insert(data.to_string(), lines);
  275. }
  276. }
  277. }
  278. };
  279. if *all {
  280. println!("Extract All:");
  281. for file in files.iter() {
  282. extractor(file);
  283. }
  284. } else {
  285. println!("Extract {}:", *count);
  286. for file in files.iter().take(*count as usize) {
  287. extractor(file);
  288. }
  289. }
  290. println!("Chapters: {:?}", chapters);
  291. /*
  292. "AMO.8.9": "“And on that day,” declares the Lord God,\n“I will make the sun go down at noonand darken the earth in broad daylight.\n"}
  293. ^ noonand ? Shouldn't that be "noon and"? Check original. Original has a break between them. Check merge routine.
  294. */
  295. /*
  296. // for file in files.iter().take(*count as usize) {
  297. for file in files_iter {
  298. /* ESV
  299. JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
  300. JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
  301. JHN.8.11 ["sin no more.”"]
  302. JHN.8.11 ["]]"] <- What is this? It is the the original HTML.
  303. JHN.8.11 [" "]
  304. */
  305. }
  306. */
  307. }
  308. Some(Commands::Test {}) => {
  309. println!("Testing...");
  310. let path = Path::new(&cli.work).join("GEN.1.NIV");
  311. let buffer = std::fs::read_to_string(path).unwrap();
  312. let document = scraper::Html::parse_document(&buffer);
  313. let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
  314. let _span_class = scraper::Selector::parse("span[class]").unwrap();
  315. let span_selector = scraper::Selector::parse("span").unwrap();
  316. for span in document.select(&span_data_usfm) {
  317. if let Some(data) = span.attr("data-usfm") {
  318. println!("data-usfm {}:", data);
  319. let mut lines = Vec::<&str>::new();
  320. for data_span in span.select(&span_selector) {
  321. if let Some(data_class) = data_span.attr("class") {
  322. if data_class.contains("content") {
  323. let mut text = data_span.text().collect::<Vec<_>>();
  324. println!("{} {:?}", data, text);
  325. lines.append(&mut text);
  326. }
  327. }
  328. }
  329. println!("data {} lines {:?}", data, lines);
  330. }
  331. }
  332. /*
  333. // Test finding div>a[href^="/bible/"]
  334. // let a_selector = scraper::Selector::parse("div>a").unwrap();
  335. let a_selector = scraper::Selector::parse(r#"div>a[href ^="/bible/"]"#).unwrap();
  336. // This reduces down the number of items to check (from ~40 to 4)!
  337. // And invalidates the check for /bible/ !
  338. for a in document.select(&a_selector) {
  339. let text = a.text().collect::<Vec<_>>();
  340. println!("text: {:?}", text);
  341. if let Some(href) = a.attr("href") {
  342. println!("href = {}", href);
  343. }
  344. println!("=====");
  345. }
  346. */
  347. }
  348. None => {
  349. println!("Looking for FETCH or EXTRACT");
  350. println!("I've got nothing to do here...");
  351. }
  352. }
  353. /*
  354. return;
  355. let client = reqwest::blocking::Client::builder()
  356. .user_agent(APP_USER_AGENT)
  357. .build()
  358. .unwrap();
  359. if true {
  360. let mut url = String::from(URL);
  361. let mut previous_url = url.clone();
  362. let mut working = true;
  363. while working {
  364. // Begin the fetching process...
  365. let result = fetch_cache(&client, url.as_str());
  366. working = false;
  367. // Ok, HTML, get parsing!
  368. let document = scraper::Html::parse_document(&result.html);
  369. // For now, I don't care about parsing, just crawling/saving.
  370. // Locate the a href.
  371. let a_selector = scraper::Selector::parse("div>a").unwrap();
  372. for a in document.select(&a_selector) {
  373. let c = a.attr("class");
  374. if c.is_some() {
  375. continue;
  376. }
  377. let href = a.attr("href");
  378. if href.is_some() {
  379. let href = href.unwrap();
  380. if href.contains("/bible/") {
  381. let text = a.text().collect::<Vec<_>>();
  382. if text.len() != 1 {
  383. continue;
  384. }
  385. if text[0] != "Next Chapter" {
  386. continue;
  387. }
  388. // previous_url = url;
  389. url = href.to_string();
  390. if url.starts_with("/") {
  391. url = String::from("https://www.bible.com") + &url;
  392. }
  393. working = true;
  394. break;
  395. }
  396. }
  397. }
  398. if working {
  399. if !result.cached {
  400. // Don't sleep if the results we just got were from the cache.
  401. thread::sleep(Duration::from_secs(10));
  402. }
  403. }
  404. }
  405. println!("I'm finished.");
  406. } else {
  407. // let res = client.get("https://httpbin.org/anything").send().unwrap();
  408. // println!("anything: {}", res.text().unwrap());
  409. let mut file = File::open(Path::new("fetch1.html")).unwrap();
  410. let mut buffer = String::new();
  411. let _ = file.read_to_string(&mut buffer);
  412. drop(file);
  413. /*
  414. let res = client.get(URL).send().unwrap();
  415. let buffer = res.text().unwrap();
  416. println!("{}", res.text().unwrap());
  417. */
  418. let document = scraper::Html::parse_document(&buffer);
  419. let h1_selector = scraper::Selector::parse("h1").unwrap();
  420. let h1 = document.select(&h1_selector).next().unwrap();
  421. println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
  422. // Selectors can't match on "if it exists" or
  423. // return a matching value, type fields.
  424. let span_selector = scraper::Selector::parse("span").unwrap();
  425. for span in document.select(&span_selector) {
  426. let data = span.attr("data-usfm");
  427. if data.is_some() {
  428. // Ok, iterate over these items...
  429. for data_span in span.select(&span_selector) {
  430. let d = data_span.attr("class");
  431. // println!("data_span: {:?}", data_span);
  432. if d.is_some() {
  433. let dt = d.unwrap();
  434. if dt.contains("content") {
  435. let text = data_span.text().collect::<Vec<_>>();
  436. println!("{} {:?}", data.unwrap(), text)
  437. }
  438. }
  439. }
  440. /*
  441. GEN.1.25 ["And God made the beasts of the earth according to their kinds and the livestock according to their kinds, and everything that creeps on the ground according to its kind. And God saw that it was good."]
  442. GEN.1.25 [" "]
  443. GEN.1.26 ["Then God said, "]
  444. GEN.1.26 ["“Let us make man"]
  445. GEN.1.26 [" in our image, "]
  446. GEN.1.26 ["after our likeness. And "]
  447. GEN.1.26 ["let them have dominion over the fish of the sea and over the birds of the heavens and over the livestock and over all the earth and over every creeping thing that creeps on the earth.”"]
  448. GEN.1.26 [" "]
  449. GEN.1.27 ["So God created man in his own image,"]
  450. GEN.1.27 ["in the image of God he created him;"]
  451. GEN.1.27 [" "]
  452. GEN.1.27 ["male and female he created them."]
  453. GEN.1.27 [" "]
  454. */
  455. /*
  456. let mut text = span.text().collect::<Vec<_>>();
  457. println!("span: {} {:?}", data.unwrap(), text);
  458. */
  459. }
  460. }
  461. }
  462. // let res = client.get("https://www.bible.com/bible/59/GEN.1.ESV").send().unwrap();
  463. */
  464. }
  465. /*
  466. Book/chapter: <h1>
  467. <h1>Genesis 1</h1>
  468. <div data-usfm="">, <span data-usfm="">
  469. <div data-usfm="GEN.1" class=
  470. "ChapterContent_chapter__uvbXo">
  471. <div class="ChapterContent_label__R2PLt">
  472. 1
  473. </div>
  474. <div class="ChapterContent_s1__bNNaW">
  475. <span class=
  476. "ChapterContent_heading__xBDcs">The
  477. Creation of the World</span>
  478. </div>
  479. <div class="ChapterContent_p__dVKHb">
  480. <span data-usfm="GEN.1.1" class=
  481. "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
  482. 1</span><span class=
  483. "ChapterContent_content__RrUqA">In
  484. the</span> <span class=
  485. "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
  486. <span class=
  487. "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Job
  488. 38:4-7; Ps. 33:6; 136:5; Isa. 42:5;
  489. 45:18; John 1:1-3; Acts 14:15; 17:24;
  490. Col. 1:16, 17; Heb. 1:10; 11:3; Rev.
  491. 4:11</span></span><span class=
  492. "ChapterContent_content__RrUqA">beginning,
  493. God created the heavens and the
  494. earth.</span></span>
  495. <span data-usfm=
  496. "GEN.1.2" class=
  497. "ChapterContent_verse__57FIw"><span class="ChapterContent_label__R2PLt">
  498. 2</span><span class=
  499. "ChapterContent_content__RrUqA">The
  500. earth was</span> <span class=
  501. "ChapterContent_note__YlDW0 ChapterContent_x__tsTlk">
  502. <span class=
  503. "ChapterContent_label__R2PLt">#</span><span class="ChapterContent_body__O3qjr">Jer.
  504. 4:23</span></span><span class=
  505. "ChapterContent_content__RrUqA">without
  506. form and void, and darkness was over
  507. the face of the deep. And the Spirit of
  508. God was hovering over the face of the
  509. waters.</span></span>
  510. Next page link:
  511. <div class="[pointer-events:all]">
  512. <a href="/bible/59/GEN.2.ESV">
  513. */