Browse Source

Updating extract.

Steve Thielemann 2 months ago
parent
commit
72c9b67f17
1 changed files with 104 additions and 11 deletions
  1. 104 11
      src/main.rs

+ 104 - 11
src/main.rs

@@ -40,7 +40,11 @@ enum Commands {
         delay: u32,
     },
     /// Extract information from cached files
-    Extract {},
+    Extract {
+        /// Count
+        #[arg(short, long, default_value = "5")]
+        count: u32,
+    },
     /// Test something out
     Test {},
 }
@@ -213,10 +217,12 @@ fn main() {
             }
             println!("I'm finished fetching!");
         }
-        Some(Commands::Extract {}) => {
+        Some(Commands::Extract { count}) => {
             println!("Extract...");
-            let files = find_files(cli.work.to_str().unwrap(), cli.bible.as_str());
-            for file in files.iter().take(5) {
+            let mut files = find_files(cli.work.to_str().unwrap(), cli.bible.as_str());
+            files.insert(0, String::from("bible/GEN.1.NIV"));
+
+            for file in files.iter().take(*count as usize) {
                 println!("File: {}", file);
                 let buffer = std::fs::read_to_string(Path::new(file)).unwrap();
                 let document = scraper::Html::parse_document(&buffer);
@@ -225,11 +231,74 @@ fn main() {
                 let h1 = document.select(&h1_selector).next().unwrap();
                 println!("h1 = {:?}", h1.text().collect::<Vec<_>>());
 
-                let span_selector = scraper::Selector::parse("span").unwrap();
-                for span in document.select(&span_selector) {
+                // https://programmersportal.com/the-complete-css-selectors-cheat-sheet-with-examples-and-pdf/
+
+                // let span_selector = scraper::Selector::parse("span").unwrap();
+                let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
+                // parse r#"div>a[href ^="/bible/"]"#
+                let span_class = scraper::Selector::parse("span[class]").unwrap();
+                // span[class="ChapterContent_content__RrUqA"]
+                // let span_class_content = scraper::Selector::parse(r#"span[class~="content"]"#).unwrap();
+                // OK!  ~= matches one whole whitespace-separated word in the attribute value, so it
+                // matches <span class="this that content"> but does not match <span class="contains_content">!
+
+                let span_class_content =
+                    scraper::Selector::parse(r#"span[class="ChapterContent_content__RrUqA"]"#)
+                        .unwrap();
+
+                for span in document.select(&span_data_usfm) {
+                    // The selector already requires data-usfm, so this lookup always succeeds.
                     if let Some(data) = span.attr("data-usfm") {
-                        println!("{}:", data);
-                        let mut lines = Vec::<&str>::new();
+                        println!("data-usfm {}:", data);
+                        // let mut lines = String::new();
+
+                        // let mut lines = Vec::<&str>::new();
+                        /*
+                        for ds in span.select(&span_class) {
+                            let c = ds.attr("class").unwrap();
+                            let text = ds.text().collect::<Vec<_>>();
+                            println!("ds class {} text: {:?}", c, text);
+                        }
+                        */
+
+                        // let lines : String = span.select(&span_class_content)
+                        let lines: String = span
+                            .select(&span_class)
+                            // Only allow elements with attr class that contains "content"
+                            .filter(|x| {
+                                if let Some(c) = x.attr("class") {
+                                    if c.contains("content") {
+                                        return true;
+                                    }
+                                }
+                                false
+                            })
+                            .map(|x| {
+                                // Convert element's text() iterator into a string.
+                                let init = String::new();
+                                let j = x.text().fold(init, |acc, x| {
+                                    // print!( ">> {}<< ", x);
+                                    let mut s = acc;
+                                    if x == "  " {
+                                        // This would be a break/newline.
+                                        s.push_str("\n");
+                                    } else {
+                                        s.push_str(x);
+                                    }
+                                    s
+                                });
+                                // println!("j = {}", j);
+                                j
+                            })
+                            .collect();
+
+                        /*
+                        .fold(String::new(), |acc, x| {
+                            format!("{:?}", x)
+                        });
+                        */
+
+                        /*
                         for data_span in span.select(&span_selector) {
                             if let Some(data_class) = data_span.attr("class") {
                                 if data_class.contains("content") {
@@ -239,11 +308,12 @@ fn main() {
                                 }
                             }
                         }
-                        println!("{} {:?}", data, lines);
+                        */
+                        println!("data {} lines {}", data, lines);
                     }
                 }
 
-                /* ESV 
+                /* ESV
                 JHN.8.11 ["She said, “No one, Lord.” And Jesus said, "]
                 JHN.8.11 ["“Neither do I condemn you; go, and from now on "]
                 JHN.8.11 ["sin no more.”"]
@@ -254,11 +324,33 @@ fn main() {
         }
         Some(Commands::Test {}) => {
             println!("Testing...");
-            // Test finding div>a[href^="/bible/"]
+
             let path = Path::new("bible").join("GEN.1.NIV");
             let buffer = std::fs::read_to_string(path).unwrap();
             let document = scraper::Html::parse_document(&buffer);
 
+            let span_data_usfm = scraper::Selector::parse("span[data-usfm]").unwrap();
+            let _span_class = scraper::Selector::parse("span[class]").unwrap();
+            let span_selector = scraper::Selector::parse("span").unwrap();
+
+            for span in document.select(&span_data_usfm) {
+                if let Some(data) = span.attr("data-usfm") {
+                    println!("data-usfm {}:", data);
+                    let mut lines = Vec::<&str>::new();
+                    for data_span in span.select(&span_selector) {
+                        if let Some(data_class) = data_span.attr("class") {
+                            if data_class.contains("content") {
+                                let mut text = data_span.text().collect::<Vec<_>>();
+                                println!("{} {:?}", data, text);
+                                lines.append(&mut text);
+                            }
+                        }
+                    }
+                    println!("data {} lines {:?}", data, lines);
+                }
+            }
+            /*
+            // Test finding div>a[href^="/bible/"]
             // let a_selector = scraper::Selector::parse("div>a").unwrap();
             let a_selector = scraper::Selector::parse(r#"div>a[href ^="/bible/"]"#).unwrap();
 
@@ -273,6 +365,7 @@ fn main() {
                 }
                 println!("=====");
             }
+            */
         }
         None => {
             println!("Looking for FETCH or EXTRACT");