// use sha256;
use std::fs::{File, create_dir_all, read_dir, remove_file};
use std::io::{BufRead, BufReader, Write};
use std::path::PathBuf;
use std::result::Result;
use std::time::{Duration, SystemTime};
use url::Url;

// Error
use std::error::Error as Errorr;
use std::fmt;

#[deny(missing_docs)]
// #[warn(missing_docs)]
/// Convert a relative URL to an absolute one.
///
/// This can fail if Url is unable to parse, or Url is unable to join.
#[must_use]
pub fn relative_to_absolute(
    base_url: &str,
    relative_href: &str,
) -> Result<String, url::ParseError> {
    let base_url = Url::parse(base_url)?;
    let new_url = base_url.join(relative_href)?;
    Ok(new_url.to_string())
}

/// Save reqwest::header::HeaderMap to file.
///
/// This also stores the url in the file, so I know what URL was called for
/// this request.
///
/// It has each item on a single line:
///   header: value
/// The first line will be "url:" (which is not part of the original header).
pub fn save_headermap(
    filename: &str,
    url: &str,
    header: &reqwest::header::HeaderMap,
) -> Result<(), std::io::Error> {
    let mut fp = File::create(filename)?;
    fp.write_all(format!("url: {}\n", url).as_bytes())?;
    for (key, value) in header.iter() {
        if let Ok(value) = value.to_str() {
            fp.write_all(format!("{}: {}\n", key, value).as_bytes())?;
        }
    }
    Ok(())
}

/// Load reqwest::header::HeaderMap from file.
///
/// This will have the url of the original call in the "url" entry.
pub fn load_headermap(filename: &str) -> Result<reqwest::header::HeaderMap, std::io::Error> {
    let fp = File::open(filename)?;
    let mut buffer = BufReader::new(fp);
    let mut line = String::new();
    let mut header = reqwest::header::HeaderMap::new();

    loop {
        if buffer.read_line(&mut line)? == 0 {
            break;
        };
        let temp = line.trim_end();
        if let Some(parts) = temp.split_once(": ") {
            let head = reqwest::header::HeaderName::from_bytes(parts.0.as_bytes()).unwrap();
            if let Ok(value) = reqwest::header::HeaderValue::from_str(parts.1) {
                header.insert(head, value);
            }
        }
        line.clear();
    }
    Ok(header)
}

/// Caching web calls
///
/// Set the directory, and we're ready to make cached web calls.
/// Since we're not storing the file in memory now, max_size isn't
/// the concern it once was.
pub struct Cache {
    /// Directory where cache is stored
    pub directory: PathBuf,
    // *This is where we would select async or blocking.*
    /// Reqwest Client
    pub client: reqwest::blocking::Client,
    /// Vector of content-types to accept (empty = accept all)
    pub accept: Vec<String>,
    /// Max size of content to download (default unlimited)
    pub max_size: Option<u64>,
}

// Should I also have std::io::Errors in here as well?
// I can have File IO errors.
/// Status of a fetch
#[allow(dead_code)]
#[derive(Debug)]
#[repr(u8)]
pub enum Status {
    /// File was downloaded.
    Fetched(PathBuf),
    /// File was retrieved from cache.
    Cached(PathBuf),
}

impl Status {
    /// Return the downloaded path, for either variant.
    pub fn download_path(&self) -> &PathBuf {
        match self {
            Status::Fetched(path) | Status::Cached(path) => path,
        }
    }
}

/// Errors returned by the cache.
#[derive(Debug)]
pub enum Error {
    /// Reqwest error (unable to connect).
    ReqwestError(reqwest::Error),
    /// IO error (std::io::Error).
    IOError(std::io::Error),
    /// Content-Type wasn't allowed, see Cache.accept.
    Unacceptable(String), // Content-Type
    /// Content was too big, see Cache.max_size.
    TooBig(u64),
    /// HTTP Error/status code.
    HttpErrorStatus(u16),
}
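
// A hedged usage sketch, not part of the original module: one way a caller
// might branch on Status and the Error variants above. The helper name and the
// log messages are illustrative only.
#[allow(dead_code)]
fn example_handle_fetch(cache: &Cache, url: &str) {
    match cache.fetch(url) {
        Ok(status) => println!("body saved at {}", status.download_path().display()),
        Err(Error::TooBig(len)) => eprintln!("skipped {}: {} bytes exceeds max_size", url, len),
        Err(Error::Unacceptable(ct)) => eprintln!("skipped {}: content-type {}", url, ct),
        Err(e) => eprintln!("fetch of {} failed: {}", url, e),
    }
}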
// This allows ? to return cache::Error from std::io::Error (see expire).
impl From<std::io::Error> for Error {
    fn from(e: std::io::Error) -> Self {
        Self::IOError(e)
    }
}

impl From<reqwest::Error> for Error {
    fn from(e: reqwest::Error) -> Self {
        Self::ReqwestError(e)
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Error::ReqwestError(e) => write!(f, "ReqwestError: {:?}", e),
            Error::IOError(e) => write!(f, "IOError: {:?}", e),
            Error::Unacceptable(ct) => write!(f, "Content-Type {} not allowed", ct),
            Error::TooBig(size) => write!(f, "Content-Size {} too big", size),
            Error::HttpErrorStatus(status) => write!(f, "Status Code: {}", status),
        }
    }
}

// This made anyhow happy with my cache::Error.
impl Errorr for Error {}

/*
Some possible content-type values:
We're only interested in a few of these...

text/css
text/javascript
text/plain
image/jpeg
image/png
image/gif
application/xml
application/javascript
*/

// If nothing is given for useragent, we default to the application name and version.
static APP_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),);

static HEADER_EXT: &str = ".header";

impl Cache {
    /// Construct Cache using the given directory for caching, and useragent.
    pub fn new(dir: PathBuf, useragent: Option<&str>) -> Result<Self, Error> {
        // Verify the directory exists
        let path = dir.as_path();
        if path.exists() {
            if !path.is_dir() {
                // It exists, but it isn't a directory! What?!
                return Err(Error::IOError(std::io::Error::new(
                    std::io::ErrorKind::Other,
                    format!(
                        "Can't use {} as the cache dir, it exists but is not a directory.",
                        dir.display()
                    ),
                )));
            }
        } else if let Err(e) = create_dir_all(path) {
            return Err(Error::IOError(e));
        }

        let user_agent = if let Some(ua) = useragent {
            ua
        } else {
            APP_USER_AGENT
        };

        // This is where we select async or blocking.
        match reqwest::blocking::Client::builder()
            .user_agent(user_agent)
            .build()
        {
            Ok(client) => Ok(Self {
                directory: dir,
                client,
                accept: vec![], // Accept all content-types.
                max_size: None, // Some(256 * 1024 * 1024), // 256 MB
            }),
            Err(e) => {
                // Client::builder error
                Err(Error::ReqwestError(e))
            }
        }
    }

    /// Add a content-type to the accept list.
    #[allow(dead_code)]
    pub fn add_content_type(&mut self, content_type: String) {
        self.accept.push(content_type);
    }

    /// Clear the accept list (accept everything again).
    #[allow(dead_code)]
    pub fn clear_content_type(&mut self) {
        self.accept.clear();
    }

    /// Set the maximum content size to download.
    #[allow(dead_code)]
    pub fn set_max_size(&mut self, size: u64) {
        self.max_size = Some(size);
    }

    /// Remove the maximum content size limit.
    #[allow(dead_code)]
    pub fn clear_max_size(&mut self) {
        self.max_size = None;
    }

    /// Create a safe filename from a url for the header/content files.
    pub fn url_to_basename(url: &str) -> String {
        let filename = if url.ends_with('/') {
            ""
        } else if let Some(has_file) = url.rsplit_once('/') {
            has_file.1
        } else {
            ""
        };

        if filename.is_empty() {
            // Getting the filename part failed.
            // Like in cases where the url is https://go.dev/dl/
            // Which becomes go-dev-dl
            let mut path = url.to_string();
            path = path.replace("https://", "");
            path = path.replace("http://", "");
            path = path.replace('/', "-");
            path = path.replace('.', "-");
            if path.ends_with('-') {
                path.pop();
            }
            return path;
        }
        filename.to_string()
    }
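
    /// A hedged configuration sketch, not part of the original API: shows how the
    /// constructor and the setters above combine. The user agent string and the
    /// limits are illustrative only.
    #[allow(dead_code)]
    fn example_configure(dir: PathBuf) -> Result<Self, Error> {
        let mut cache = Self::new(dir, Some("my-crawler/0.1"))?;
        cache.add_content_type("text/html".to_string());
        cache.add_content_type("application/json".to_string());
        cache.set_max_size(8 * 1024 * 1024); // 8 MiB cap, arbitrary for the sketch
        Ok(cache)
    }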

    /// Expire files in the cache older than the given age.
    ///
    /// Use DirEntry.modified, since it updates when a file is freshened/downloaded.
    /// DirEntry.created isn't updated when a file is rewritten.
    #[allow(dead_code)]
    pub fn expire(&self, age: Duration) -> Result<bool, Error> {
        let now = SystemTime::now();
        let mut result: bool = false;

        for file in read_dir(self.directory.as_path())? {
            let file = file?;
            if let Ok(d) = file.metadata() {
                if d.is_file() {
                    // Created isn't updated if the file is fetched. Use modified, that updates on fetch.
                    let filename = String::from(file.file_name().to_str().unwrap());
                    if filename.ends_with(HEADER_EXT) {
                        // This is a header cache file...
                        if let Ok(modify) = d.modified() {
                            if let Ok(delta) = now.duration_since(modify) {
                                // println!("expire {} = modified {}", filename, delta.as_secs());
                                if delta > age {
                                    // println!("Would delete: {} (and .content)", filename);
                                    let mut filepath = self.directory.join(filename);
                                    let r = remove_file(&filepath);
                                    if let Err(e) = r {
                                        println!("RemoveFile {:?}: {}", filepath, e);
                                    }
                                    // Also delete the content file!
                                    // Which is trickier to find now...
                                    Self::remove_from_filename(&mut filepath);
                                    // filepath.set_extension("content");
                                    let r = remove_file(&filepath);
                                    if let Err(e) = r {
                                        println!("RemoveFile {:?}: {}", filepath, e);
                                    }
                                    result = true;
                                }
                            }
                        }
                    }
                    /*
                    if let Ok(access) = d.accessed() {
                        if let Ok(delta) = now.duration_since(access) {
                            println!("accessed {:?} = accessed {}", file.file_name(), delta.as_secs());
                            if delta > age {
                                println!("Expire: {:?}", file.file_name());
                            }
                        }
                    }
                    if let Ok(created) = d.created() {
                        if let Ok(delta) = now.duration_since(created) {
                            println!("expire {:?} = created {}", file.file_name(), delta.as_secs());
                            if delta > age {
                                println!("Would delete: {:?}", file.file_name());
                                result = true;
                            }
                        }
                    }
                    */
                }
            }
        }
        Ok(result)
    }

    /// Given a url, return the filename.
    ///
    /// The filename might not exist. It is only the filename
    /// that would be used for the given url.
    pub fn filename_for_url(&self, url: &str) -> PathBuf {
        self.directory.as_path().join(Self::url_to_basename(url))
    }

    /// Given a url, return an open file.
    ///
    /// Reading from cache.
    #[allow(dead_code)]
    pub fn file(&self, url: &str) -> Option<File> {
        let base = self.filename_for_url(url);
        /*
        let base = self
            .directory
            .as_path()
            .join(Self::url_to_basename(url).unwrap());
        */
        if base.exists() {
            return Some(File::open(base).unwrap());
        }
        None
    }

    /// Return the filename from a PathBuf as a String.
    #[allow(dead_code)]
    #[must_use]
    fn pathbuf_filename(path: &PathBuf) -> String {
        path.file_name().unwrap().to_string_lossy().to_string()
    }

    /// Append to the PathBuf filename.
    ///
    /// This is different from PathBuf::set_extension,
    /// which replaces everything after the last dot.
    fn append_to_filename(path: &mut PathBuf, append: &str) {
        // Append to the filename.
        let filename = path.file_name().unwrap().to_string_lossy().to_string() + append;
        path.set_file_name(filename);
    }

    /// Remove the last extension from the filename.
    ///
    /// Given something.tar.gz.header, return something.tar.gz.
    fn remove_from_filename(path: &mut PathBuf) {
        let filename = Self::pathbuf_filename(path);
        if let Some(parts) = filename.rsplit_once('.') {
            path.set_file_name(parts.0);
        } else {
            panic!(
                "Unable to locate the trailing extension . from: {}",
                path.display()
            );
        }
    }

    /// Fetch, without using the cache.
    ///
    /// This deletes the .header cache file, which forces a fetch.
    #[allow(dead_code)]
    pub fn fetch_nocache(&self, url: &str) -> Result<Status, Error> {
        let mut base = self.filename_for_url(url);
        Self::append_to_filename(&mut base, HEADER_EXT);
        if base.exists() {
            if let Err(e) = remove_file(&base) {
                // unlink failed
                return Err(Error::IOError(e));
            }
        }
        self.fetch(url)
    }
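
    /// A hedged maintenance sketch, not part of the original API: force-refresh one
    /// URL with fetch_nocache, then expire anything older than a week. The one-week
    /// cutoff is arbitrary.
    #[allow(dead_code)]
    fn example_refresh_and_expire(&self, url: &str) -> Result<Status, Error> {
        let status = self.fetch_nocache(url)?;
        // expire() reports whether anything was removed; the sketch ignores it.
        self.expire(Duration::from_secs(7 * 24 * 60 * 60))?;
        Ok(status)
    }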

    // I'm not sure about using Result here...
    // It would allow for ? usage.
    /// Fetch the URL from the web.
    ///
    /// This returns Status, which tells you whether the body was freshly
    /// downloaded (Fetched) or served from the cached copy (Cached).
    #[must_use]
    pub fn fetch(&self, url: &str) -> Result<Status, Error> {
        let base = self.filename_for_url(url);
        /*
        let base = self
            .directory
            .as_path()
            .join(Self::url_to_basename(url).unwrap());
        */
        let mut builder = self.client.get(url); // Don't send just yet!

        // Set some headers to see if the page content has changed.
        let mut header_file = base.clone();
        Self::append_to_filename(&mut header_file, HEADER_EXT);

        if header_file.exists() {
            // Ok! We have existing information. Retrieve it.
            match load_headermap(header_file.to_str().unwrap()) {
                Ok(old_header) => {
                    // Look for: ETag, Last-Modified
                    if let Some(lastmod) = old_header.get("Last-Modified") {
                        builder = builder.header("If-Modified-Since", lastmod);
                    } else if let Some(date) = old_header.get("Date") {
                        // Keep trying...
                        builder = builder.header("If-Modified-Since", date);
                    }
                    if let Some(etag) = old_header.get("etag") {
                        builder = builder.header("If-None-Match", etag);
                    }
                }
                Err(e) => {
                    return Err(Error::IOError(e));
                }
            }
        }

        match builder.send() {
            Ok(mut result) => {
                if result.status() == 304 {
                    // Cache hit!
                    return Ok(Status::Cached(base));
                }

                if result.status() == 200 {
                    // Success!

                    // When caching fails:
                    // If content_length (from the previous fetch) matches the current one,
                    // could we assume it hasn't changed, and just use the cache?
                    // Or would that be a big assumption?

                    // Only check the content_length size if we have been
                    // given a max_size.
                    if let Some(max_size) = self.max_size {
                        if let Some(len) = result.content_length() {
                            if len > max_size {
                                // Is there a way to abort this safely? Apparently yes! :D
                                // let byte = Byte::from_u64(len);
                                // let adjusted_byte = byte.get_appropriate_unit(UnitType::Binary);
                                // println!("Too Big! {adjusted_byte:.2} {}", url);
                                return Err(Error::TooBig(len));
                            }
                        }
                    }

                    // Only check acceptable content-types if given.
                    if !self.accept.is_empty() {
                        if let Some(content_type) = result.headers().get("content-type") {
                            // Check to see if this is accepted content.
                            let mut ct = content_type.to_str().unwrap();
                            let possible = ct.split_once(';');
                            if let Some((ct_part, _)) = possible {
                                ct = ct_part;
                            }
                            if !self.accept.contains(&ct.to_string()) {
                                // println!("Unacceptable content-type {} {}", ct, url);
                                return Err(Error::Unacceptable(ct.to_string()));
                            }
                        }
                    }

                    if let Err(e) = save_headermap(header_file.to_str().unwrap(), url, result.headers()) {
                        return Err(Error::IOError(e));
                    }

                    match File::create(base.to_str().unwrap()) {
                        Ok(mut fp) => match result.copy_to(&mut fp) {
                            Ok(_) => {}
                            Err(e) => {
                                return Err(Error::ReqwestError(e));
                            }
                        },
                        Err(e) => {
                            return Err(Error::IOError(e));
                        }
                    }
                    // result.copy_to(&mut fp)?;

                    /*
                    // async
                    while let Ok(Some(chunk)) = result.chunk().await {
                        let _ = fp.write(&chunk);
                    }
                    */

                    Ok(Status::Fetched(base))
                } else {
                    // Status error
                    // println!("Error {} {}", result.status(), url);
                    Err(Error::HttpErrorStatus(u16::from(result.status())))
                }
            }
            Err(e) => Err(Error::ReqwestError(e)),
        }
    }
}

/*
Some useful httpbin endpoints:

https://httpbin.org/anything
    /headers
    /ip
    /user-agent
    /status/404
    /status/200
    /cache/value    for cache-control
    /cache          (if if-modified-since or if-none-match are present, returns 304)
    /etag/value     for etag (if-none-match or if-match)
    /uuid
    /brotli
    /deflate
    /gzip
    ^ I wonder what happens if I request one that isn't enabled in reqwest?
*/
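
// A hedged end-to-end sketch, not part of the original module: fetch a page and
// read the cached body back from disk. The directory and URL are whatever the
// caller supplies; nothing here is hard-coded.
#[allow(dead_code)]
fn example_fetch_and_read(dir: PathBuf, url: &str) -> Result<String, Box<dyn Errorr>> {
    let cache = Cache::new(dir, None)?;
    let status = cache.fetch(url)?;
    // Both Status::Fetched and Status::Cached carry the on-disk path to the body.
    let body = std::fs::read_to_string(status.download_path())?;
    Ok(body)
}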

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use testdir::testdir;

    #[test]
    fn relative_test() {
        let rel_abs: HashMap<(&str, &str), &str> = HashMap::from([
            (
                ("http://meow.org/rabbit", "/llama/index.html"),
                "http://meow.org/llama/index.html",
            ),
            (
                ("https://example.com/dir/index.html", "about.html"),
                "https://example.com/dir/about.html",
            ),
            (
                ("https://example.com/dir/index.html", "../and/about.html"),
                "https://example.com/and/about.html",
            ),
            (
                (
                    "https://here.com/dir/index.html",
                    "http://there.com/about.html",
                ),
                "http://there.com/about.html",
            ),
        ]);

        for (base, url) in rel_abs {
            if let Ok(abs) = relative_to_absolute(base.0, base.1) {
                assert_eq!(abs, url, "Base {}, Rel {} => {}", base.0, base.1, url);
            } else {
                panic!("Failed {} + {} => {}", base.0, base.1, url);
            }
        }
    }

    #[test]
    fn url_to_filename_test() {
        let mut dir = testdir!();
        dir.push("cache");
        let temp = dir.clone();
        let cache = Cache::new(dir, None).unwrap();

        // url_to_basename
        let url_base: HashMap<&str, &str> = HashMap::from([
            ("https://go.dev/dl/go1.23.45.tar.gz", "go1.23.45.tar.gz"),
            ("https://go.dev/dl", "dl"),
            ("https://go.dev/dl/", "go-dev-dl"),
        ]);

        for (url, base) in url_base {
            // Verify url_to_basename.
            let basename = Cache::url_to_basename(url);
            assert_eq!(base, basename, "{} -> {}", url, base);

            // Verify filename_for_url.
            let path = cache.filename_for_url(url);
            let mut newpath = temp.clone();
            newpath.push(base);
            assert_eq!(path.as_os_str(), newpath.as_os_str(), "{} -> {}", url, base);
        }

        for filename in vec!["go1.23.45.tar.gz", "test.html"] {
            let newname = String::from(filename) + HEADER_EXT;
            let mut newpath = temp.clone();
            newpath.set_file_name(filename);
            Cache::append_to_filename(&mut newpath, HEADER_EXT);
            assert_eq!(
                &newpath.file_name().unwrap().to_string_lossy().to_string(),
                &newname,
                "{} {}",
                filename,
                HEADER_EXT
            );

            // Test to make sure this removes HEADER_EXT from the filename.
            Cache::remove_from_filename(&mut newpath);
            assert_eq!(
                &newpath.file_name().unwrap().to_string_lossy().to_string(),
                filename,
                "{}",
                filename
            )
        }
    }

    #[test]
    #[cfg(not(feature = "local-httpbin"))]
    fn cache_fetch() {
        let mut dir = testdir!();
        dir.push("cache");
        // Make a copy of the cache directory PathBuf for verifying paths.
        let mut t = dir.clone();
        let cache = Cache::new(dir, None).unwrap();

        let r = cache.fetch("https://httpbin.org/anything");
        t.push("anything");

        if let Ok(r) = r {
            if let Status::Fetched(f) = r {
                assert!(f.exists(), "Cache file exists.");
                assert_eq!(f, t, "Cache path is what we were expecting.");

                let mut header_file = t.clone();
                Cache::append_to_filename(&mut header_file, HEADER_EXT);
                assert!(header_file.exists(), "Cache header file exists.");

                t.pop();
                t.push("anything.header");
                assert_eq!(header_file, t, "Cache header path is what we expected.");
            } else {
                panic!("Cache Status is not Status::Fetched, is: {:?}", r);
            }
        } else {
            panic!("cache.fetch: {:?}", r);
        }
    }

    /*
    Add to Cargo.toml:

        [features]
        local-httpbin = []

    Use:

        #[test]
        #[cfg(feature = "local-httpbin")]

    And then:

        cargo test -F local-httpbin -- --show-output

    This runs the local httpbin tests.
    */
    #[test]
    #[cfg(feature = "local-httpbin")]
    fn call_local() {
        let mut dir = testdir!();
        dir.push("cache");
        // Make a copy of the cache directory PathBuf for verifying paths.
        let mut t = dir.clone();
        let cache = Cache::new(dir, None).unwrap();

        let teapot_url = "http://127.0.0.1/status/418";
        let r = cache.fetch(teapot_url);
        if let Err(e) = r {
            if let Error::HttpErrorStatus(code) = e {
                assert_eq!(code, 418);
            } else {
                panic!("Not an ErrorStatus");
            }
        } else {
            panic!("Expected Error::HttpErrorStatus, got: {r:?}");
        }
        // println!("{:?}", r);

        let r = cache.fetch("http://127.0.0.1:1024");
        assert!(r.is_err(), "Confirm connection error");

        /*
        I disabled brotli in the Client builder. I get an error below about
        invalid UTF-8. The httpbin server isn't smart enough to see I don't
        support it, and sends it anyway. :(
        */
        /*
        let brot_url = "http://127.0.0.1/brotli";
        let r = cache.fetch(brot_url);
        println!("Brotli: {:?}", r);
        if let Status::Fetched(path) = r {
            let data = std::fs::read_to_string(path).unwrap();
            println!("DATA:\n{}", data);
        }
        */
    }

    /*
    These tests require a running local httpbin image.

    ```
    services:
      httpbin:
        image: kennethreitz/httpbin
        ports:
          - "80:80"
    ```
    */
    #[test]
    #[cfg(feature = "local-httpbin")]
    fn cache_local() {
        let mut dir = testdir!();
        dir.push("cache");
        // Make a copy of the cache directory PathBuf for verifying paths.
        let mut t = dir.clone();
        let cache = Cache::new(dir, None).unwrap();

        let etag_url = "http://127.0.0.1/etag/meow";
        let r = cache.fetch(etag_url);
        if let Ok(r) = r {
            match r {
                Status::Fetched(_) => {}
                _ => {
                    panic!("Expected Status::Fetched on 1st request.");
                }
            }
        } else {
            panic!("Unexpected error: {r:?}");
        }

        // 2nd call, the etag header is set.
        let r2 = cache.fetch(etag_url);
        if let Ok(r2) = r2 {
            match r2 {
                Status::Cached(_) => {}
                _ => {
                    panic!("Expected Status::Cached on 2nd request.");
                }
            }
        } else {
            panic!("Unexpected error: {r2:?}");
        }
        // println!("{:?}\n{:?}", r, r2);
    }
}
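
// A hedged sketch of an extra, offline test, not part of the original suite: it
// only exercises Status::download_path and filename_for_url, so no network or
// local httpbin is needed. The URL below is made up.
#[cfg(test)]
mod offline_sketch {
    use super::*;
    use testdir::testdir;

    #[test]
    fn download_path_matches_filename_for_url() {
        let mut dir = testdir!();
        dir.push("cache");
        let cache = Cache::new(dir, None).unwrap();

        let url = "https://example.com/files/data.json";
        let expected = cache.filename_for_url(url);

        // Both Status variants hand back the path they were built with.
        let fetched = Status::Fetched(expected.clone());
        let cached = Status::Cached(expected.clone());
        assert_eq!(fetched.download_path(), &expected);
        assert_eq!(cached.download_path(), &expected);
    }
}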