12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811 |
// use sha256;
use std::error::Error as Errorr;
use std::fmt;
use std::fs::{create_dir_all, read_dir, remove_file, File};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::PathBuf;
use std::result::Result;
use std::time::{Duration, SystemTime};

use url::Url;
- #[deny(missing_docs)]
- // #[warn(missing_docs)]
- /// Convert relate to absolute
- ///
- /// This can fail if Url is unable to parse, or Url is unable to join.
- #[must_use]
- pub fn relative_to_absolute(
- base_url: &str,
- relative_href: &str,
- ) -> Result<String, url::ParseError> {
- let base_url = Url::parse(base_url)?;
- let new_url = base_url.join(relative_href)?;
- Ok(new_url.to_string())
- }
- /// Save reqwest::header::HeaderMap to file.
- ///
- /// This also stores the url in the file, so I know what URL was called for
- /// this reqwest.
- ///
- /// It has each item on a single line:
- /// header: value
- /// The first line will be url: (Which is not part of original header.)
- pub fn save_headermap(
- filename: &str,
- url: &str,
- header: &reqwest::header::HeaderMap,
- ) -> Result<(), std::io::Error> {
- let mut fp = File::create(filename)?;
- fp.write_all(format!("url: {}\n", url).as_bytes())?;
- for (key, value) in header.iter() {
- if let Ok(value) = value.to_str() {
- fp.write_all(format!("{}: {}\n", key, value).as_bytes())?;
- }
- }
- Ok(())
- }
- /// Load reqwest::header::HeaderMap from file.
- ///
- /// This will have the url of the original call in the "url" section.
- pub fn load_headermap(filename: &str) -> Result<reqwest::header::HeaderMap, std::io::Error> {
- let fp = File::open(filename)?;
- let mut buffer = BufReader::new(fp);
- let mut line = String::new();
- let mut header = reqwest::header::HeaderMap::new();
- loop {
- if buffer.read_line(&mut line).unwrap() == 0 {
- break;
- };
- let temp = line.trim_end();
- if let Some(parts) = temp.split_once(": ") {
- let head = reqwest::header::HeaderName::from_bytes(parts.0.as_bytes()).unwrap();
- if let Ok(value) = reqwest::header::HeaderValue::from_str(&parts.1) {
- header.insert(head, value);
- }
- }
- line.clear();
- }
- Ok(header)
- }
- /// Caching web calls
- ///
- /// Set the directory, and we're ready to make cached web calls.
- /// Since we're not storing the file in memory now, max_size isn't
- /// the concern it once was.
- pub struct Cache {
- /// Directory where cache is stored
- pub directory: PathBuf,
- // *This is where we would select async or blocking.*
- /// Reqwest Client
- pub client: reqwest::blocking::Client,
- /// Vector of content-types to accept (empty=all)
- pub accept: Vec<String>,
- /// Max size of content to download (default unlimited)
- pub max_size: Option<u64>,
- }
- // Should I also have std::io::Errors in here as well?
- // I can have File IO errors.
/// Outcome of a successful fetch.
#[allow(dead_code)]
#[derive(Debug)]
#[repr(u8)]
pub enum Status {
    /// The file was downloaded.
    Fetched(PathBuf),
    /// The file was served from the cache.
    Cached(PathBuf),
}

impl Status {
    /// Path of the file on disk, whichever variant this is.
    pub fn download_path(&self) -> &PathBuf {
        // Both variants carry the path, so this pattern is irrefutable.
        let (Status::Fetched(path) | Status::Cached(path)) = self;
        path
    }
}
- #[derive(Debug)]
- pub enum Error {
- /// Reqwest error (unable to connect), or IO Error std::io::Error
- ReqwestError(reqwest::Error),
- IOError(std::io::Error),
- /// Content-Type wasn't allowed, see Cache.accept.
- Unacceptable(String), // Content-Type
- /// Content was too big, see Cache.max_size.
- TooBig(u64),
- /// HTTP Error/status code.
- HttpErrorStatus(u16),
- }
- // This allows ? to return cache::Error from std::io::Error (see expire)
- impl From<std::io::Error> for Error {
- fn from(e: std::io::Error) -> Self {
- Self::IOError(e)
- }
- }
- impl From<reqwest::Error> for Error {
- fn from(e: reqwest::Error) -> Self {
- Self::ReqwestError(e)
- }
- }
- impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match self {
- Error::ReqwestError(e) => write!(f, "ReqwestError: {:?}", e),
- Error::IOError(e) => write!(f, "IOError: {:?}", e),
- Error::Unacceptable(ct) => write!(f, "Content-Type {} not allowed", ct),
- Error::TooBig(size) => write!(f, "Content-Size {} too big", size),
- Error::HttpErrorStatus(status) => write!(f, "Status Code: {}", status),
- }
- }
- }
- // This made anyhow happy with my cache::Error.
- impl Errorr for Error {}
/*
Some possible content-type values. We're only interested in a few of these:
text/css
text/javascript
text/plain
image/jpeg
image/png
image/gif
application/xml
application/javascript
*/
- // If nothing is given for useragent, we default to the application name and version.
- static APP_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),);
/// Suffix appended to a cached file's name to get its header file's name.
static HEADER_EXT: &str = ".header";
- impl Cache {
- /// Construct Cache using given directory for caching, and useragent.
- pub fn new(dir: PathBuf, useragent: Option<&str>) -> Result<Self, Error> {
- // Verify the directory exists
- let path = dir.as_path();
- if path.exists() {
- if !path.is_dir() {
- // It exists, but it isn't a directory! What?!
- return Err(Error::IOError(std::io::Error::new(
- std::io::ErrorKind::Other,
- format!(
- "Can't create Cache dir {}, it already exists.",
- dir.display()
- ),
- )));
- }
- } else {
- match create_dir_all(path) {
- Err(e) => {
- return Err(Error::IOError(e));
- }
- Ok(_) => {}
- }
- }
- let user_agent = if let Some(ua) = useragent {
- ua
- } else {
- APP_USER_AGENT
- };
- // This is where we select async or blocking.
- match reqwest::blocking::Client::builder()
- .user_agent(user_agent)
- .build()
- {
- Ok(client) => {
- Ok(Self {
- directory: dir,
- client: client,
- accept: vec![], // Accept all content-type.
- max_size: None, // Some(256 * 1024 * 1024), // 256 MB
- })
- }
- Err(e) => {
- // Client::builder error
- return Err(Error::ReqwestError(e));
- }
- }
- }
- #[allow(dead_code)]
- pub fn add_content_type(&mut self, content_type: String) {
- self.accept.push(content_type);
- }
- #[allow(dead_code)]
- pub fn clear_content_type(&mut self) {
- self.accept.clear();
- }
- #[allow(dead_code)]
- pub fn set_max_size(&mut self, size: u64) {
- self.max_size = Some(size);
- }
- #[allow(dead_code)]
- pub fn clear_max_size(&mut self) {
- self.max_size = None;
- }
/// Create a safe filename from a url, for the header/content files.
///
/// Normally this is the text after the last `/` of `url`. When there is no
/// such text (e.g. `https://go.dev/dl/`), or when it is `.` or `..`, the
/// name is built from the whole url instead: the scheme is dropped and
/// every `/` and `.` becomes `-` (giving `go-dev-dl`).
///
/// `.` and `..` must never be returned: joined onto the cache directory they
/// would name the directory itself or its parent rather than a file inside it.
pub fn url_to_basename(url: &str) -> String {
    // Text after the last '/'; empty if the url ends in '/' or has none.
    let filename = url.rsplit_once('/').map_or("", |(_, tail)| tail);
    if !filename.is_empty() && filename != "." && filename != ".." {
        return filename.to_string();
    }

    // Getting the filename part failed.
    // Like in cases where the url is https://go.dev/dl/
    // Which becomes go-dev-dl
    let mut path = url
        .replace("https://", "")
        .replace("http://", "")
        .replace('/', "-")
        .replace('.', "-");
    if path.ends_with('-') {
        path.pop();
    }
    path
}
- /// Expire files in the cache older then given age
- ///
- /// Use DirEntry.modified, since it updates when a file is freshened/downloaded.
- /// DirEntry.created isn't updated when file is rewritten.
- #[allow(dead_code)]
- pub fn expire(&self, age: Duration) -> Result<bool, Error> {
- let now = SystemTime::now();
- let mut result: bool = false;
- for file in read_dir(self.directory.as_path())? {
- let file = file?;
- if let Ok(d) = file.metadata() {
- if d.is_file() {
- // Created isn't updated if the file is fetched. Use modified, that updates on fetch.
- let filename = String::from(file.file_name().to_str().unwrap());
- if filename.ends_with(HEADER_EXT) {
- // This is a header cache file...
- if let Ok(modify) = d.modified() {
- if let Ok(delta) = now.duration_since(modify) {
- // println!("expire {} = modified {}", filename, delta.as_secs());
- if delta > age {
- // println!("Would delete: {} (and .content)", filename);
- let mut filepath = self.directory.join(filename);
- let r = remove_file(&filepath);
- if let Err(e) = r {
- println!("RemoveFile {:?}: {}", filepath, e);
- }
- // Also delete .content !
- // Which is trickier to find now...
- Self::remove_from_filename(&mut filepath);
- // filepath.set_extension("content");
- let r = remove_file(&filepath);
- if let Err(e) = r {
- println!("RemoveFile {:?}: {}", filepath, e);
- }
- result = true;
- }
- }
- }
- }
- /*
- if let Ok(access) = d.accessed() {
- if let Ok(delta) = now.duration_since(access) {
- println!("accessed {:?} = accessed {}", file.file_name(), delta.as_secs());
- if delta > age {
- println!("Expire: {:?}", file.file_name());
- }
- }
- }
- if let Ok(created) = d.created() {
- if let Ok(delta) = now.duration_since(created) {
- println!("expire {:?} = created {}", file.file_name(), delta.as_secs());
- if delta > age {
- println!("Would delete: {:?}", file.file_name());
- result = true;
- }
- }
- }
- */
- }
- }
- }
- Ok(result)
- }
- /// Given a url, return the filename
- ///
- /// The filename might not exist. It is only the filename
- /// that would be used for the given url.
- pub fn filename_for_url(&self, url: &str) -> PathBuf {
- self.directory.as_path().join(Self::url_to_basename(url))
- }
- /// Given a url, return an open file
- ///
- /// Reading from cache.
- #[allow(dead_code)]
- pub fn file(&self, url: &str) -> Option<File> {
- let base = self.filename_for_url(url);
- /*
- let base = self
- .directory
- .as_path()
- .join(Self::url_to_basename(url).unwrap());
- */
- if base.exists() {
- return Some(File::open(base).unwrap());
- }
- None
- }
/// Final component of `path` as a `String` (lossy for non-UTF-8 names).
///
/// # Panics
///
/// Panics if `path` has no file name.
#[allow(dead_code)]
#[must_use]
fn pathbuf_filename(path: &PathBuf) -> String {
    let name = path.file_name().unwrap();
    name.to_string_lossy().into_owned()
}
/// Append `append` to the file name of `path`.
///
/// This is different from `PathBuf::set_extension`, which replaces the
/// existing extension: `go1.23.45.tar.gz` + `.header` becomes
/// `go1.23.45.tar.gz.header`.
///
/// The name is extended as an `OsString`, so file names that are not valid
/// UTF-8 are preserved byte-for-byte.
///
/// # Panics
///
/// Panics if `path` has no file name (e.g. it ends in `..`).
fn append_to_filename(path: &mut PathBuf, append: &str) {
    let mut filename = path
        .file_name()
        .expect("append_to_filename: path has no file name")
        .to_os_string();
    filename.push(append);
    path.set_file_name(filename);
}
/// Strip the last extension from the file name of `path`.
///
/// Given something.tar.gz.header this leaves something.tar.gz
///
/// # Panics
///
/// Panics if `path` has no file name, or if the file name contains no `.`.
fn remove_from_filename(path: &mut PathBuf) {
    let name = path.file_name().unwrap().to_string_lossy().to_string();
    match name.rsplit_once(".") {
        Some((stem, _extension)) => path.set_file_name(stem),
        None => panic!(
            "Unable to locate the trailing extension . from: {}",
            path.display()
        ),
    }
}
- /// Fetch, without using the cache.
- ///
- /// This deletes the .header cache file, which forces a fetch.
- #[allow(dead_code)]
- pub fn fetch_nocache(&self, url: &str) -> Result<Status, Error> {
- let mut base = self.filename_for_url(url);
- Self::append_to_filename(&mut base, HEADER_EXT);
- if base.exists() {
- match remove_file(&base) {
- Err(e) => {
- // unlink failed
- return Err(Error::IOError(e));
- }
- Ok(_) => {}
- }
- }
- return self.fetch(url);
- }
- // I'm not sure about using Result<Status> here...
- // It would allow for ? usage.
- /// Fetch the URL from the web
- ///
- /// This returns Status, which could be Fetched or Cached copy (among other things).
- #[must_use]
- pub fn fetch(&self, url: &str) -> Result<Status, Error> {
- let base = self.filename_for_url(url);
- /*
- let base = self
- .directory
- .as_path()
- .join(Self::url_to_basename(url).unwrap());
- */
- let mut builder = self.client.get(url);
- // Don't send just yet!
- // Set some headers to see if page content has changed.
- let mut header_file = base.clone();
- Self::append_to_filename(&mut header_file, HEADER_EXT);
- if header_file.exists() {
- // Ok! We have existing information. Retrieve it.
- match load_headermap(header_file.to_str().unwrap()) {
- Ok(old_header) => {
- // Look for: ETag, Last-Modified
- if let Some(lastmod) = old_header.get("Last-Modified") {
- builder = builder.header("If-Modified-Since", lastmod);
- } else if let Some(date) = old_header.get("Date") {
- // Keep trying...
- builder = builder.header("If-Modified-Since", date);
- }
- if let Some(etag) = old_header.get("etag") {
- builder = builder.header("If-None-Match", etag);
- }
- }
- Err(e) => {
- return Err(Error::IOError(e));
- }
- }
- };
- match builder.send() {
- Ok(mut result) => {
- if result.status() == 304 {
- // Cache hit!
- return Ok(Status::Cached(base));
- }
- // Ok! Success!
- if result.status() == 200 {
- // Success!
- // When caching fails ―
- //
- // If content_length (from previous fetch) matches current?
- // Could we assume it hasn't changed, and just use cache?
- // Or would that be a big assumption?
- // Only check content_length size, if we have been
- // given a max_size.
- if let Some(max_size) = self.max_size {
- if let Some(len) = result.content_length() {
- if len > max_size {
- // Is there a way to abort this safely? Apparently yes! :D
- // let byte = Byte::from_u64(len);
- // let adjusted_byte = byte.get_appropriate_unit(UnitType::Binary);
- // println!("Too Big! {adjusted_byte:.2} {}", url);
- return Err(Error::TooBig(len));
- }
- }
- }
- // Only check acceptable content_types if given.
- if !self.accept.is_empty() {
- if let Some(content_type) = result.headers().get("content-type") {
- // Check to see if accepted content.
- let mut ct = content_type.to_str().unwrap();
- let possible = content_type.to_str().unwrap().split_once(';');
- if let Some((ct_part, _)) = possible {
- ct = ct_part;
- }
- if !self.accept.contains(&ct.to_string()) {
- // println!("Unacceptable content-type {} {}", ct, url);
- return Err(Error::Unacceptable(ct.to_string()));
- }
- }
- }
- match save_headermap(header_file.to_str().unwrap(), url, result.headers()) {
- Err(e) => {
- return Err(Error::IOError(e));
- }
- Ok(()) => {}
- }
- match File::create(base.to_str().unwrap()) {
- Ok(mut fp) => match result.copy_to(&mut fp) {
- Ok(_) => {}
- Err(e) => {
- return Err(Error::ReqwestError(e));
- }
- },
- Err(e) => {
- return Err(Error::IOError(e));
- }
- }
- // result.copy_to(&mut fp)?;
- /* // async
- while let Ok(Some(chunk)) = result.chunk().await {
- let _ = fp.write(&chunk);
- }
- */
- return Ok(Status::Fetched(base));
- } else {
- // Status error
- // println!("Error {} {}", result.status(), url);
- return Err(Error::HttpErrorStatus(u16::from(result.status())));
- }
- }
- Err(e) => {
- return Err(Error::ReqwestError(e));
- }
- }
- }
- }
- /*
- https://httpbin.org/anything
- /headers
- /ip
- /user-agent
- /status/404
- /status/200
- /cache/value for cache-control
- /cache (if-modified-since or if-none-match are present, returns 304)
- /etag/value for etag (if-none-match or if-match)
- /uuid
- /brotli
- /deflate
- /gzip
- ^ I wonder what happens if I request one that isn't enabled in reqwest?
- */
- #[cfg(test)]
- mod tests {
- use super::*;
- use std::collections::HashMap;
- use testdir::testdir;
/// relative_to_absolute resolves relative, parent-relative, root-relative
/// and already-absolute hrefs against a base url.
#[test]
fn relative_test() {
    // (base url, href) => expected absolute url
    let rel_abs: HashMap<(&str, &str), &str> = HashMap::from([
        (
            ("http://meow.org/rabbit", "/llama/index.html"),
            "http://meow.org/llama/index.html",
        ),
        (
            ("https://example.com/dir/index.html", "about.html"),
            "https://example.com/dir/about.html",
        ),
        (
            ("https://example.com/dir/index.html", "../and/about.html"),
            "https://example.com/and/about.html",
        ),
        (
            (
                "https://here.com/dir/index.html",
                "http://there.com/about.html",
            ),
            "http://there.com/about.html",
        ),
    ]);
    for ((base, href), expected) in rel_abs {
        match relative_to_absolute(base, href) {
            Ok(abs) => assert_eq!(abs, expected, "Base {}, Rel {} => {}", base, href, expected),
            Err(_) => panic!("Failed {} + {} => {}", base, href, expected),
        }
    }
}
/// Checks url_to_basename / filename_for_url, and that append_to_filename
/// and remove_from_filename undo each other.
#[test]
fn url_to_filename_test() {
    let mut dir = testdir!();
    dir.push("cache");
    let cache_dir = dir.clone();
    let cache = Cache::new(dir, None).unwrap();

    // url => expected basename
    let url_base: HashMap<&str, &str> = HashMap::from([
        ("https://go.dev/dl/go1.23.45.tar.gz", "go1.23.45.tar.gz"),
        ("https://go.dev/dl", "dl"),
        ("https://go.dev/dl/", "go-dev-dl"),
    ]);
    for (url, base) in url_base {
        // Verify url_to_basename.
        assert_eq!(base, Cache::url_to_basename(url), "{} -> {}", url, base);
        // Verify filename_for_url.
        let expected = cache_dir.join(base);
        let actual = cache.filename_for_url(url);
        assert_eq!(actual.as_os_str(), expected.as_os_str(), "{} -> {}", url, base);
    }

    for filename in ["go1.23.45.tar.gz", "test.html"] {
        let with_ext = format!("{}{}", filename, HEADER_EXT);
        let mut path = cache_dir.clone();
        path.set_file_name(filename);

        Cache::append_to_filename(&mut path, HEADER_EXT);
        let appended = path.file_name().unwrap().to_string_lossy().to_string();
        assert_eq!(&appended, &with_ext, "{} {}", filename, HEADER_EXT);

        // Test to make sure this removes HEADER_EXT from the filename.
        Cache::remove_from_filename(&mut path);
        let removed = path.file_name().unwrap().to_string_lossy().to_string();
        assert_eq!(&removed, filename, "{}", filename);
    }
}
/// First fetch of a url downloads it and creates both cache files.
/// Needs network access to httpbin.org.
#[test]
#[cfg(not(feature = "local-httpbin"))]
fn cache_fetch() {
    let mut dir = testdir!();
    dir.push("cache");
    // Paths we expect inside the cache directory.
    let expected_file = dir.join("anything");
    let expected_header = dir.join("anything.header");
    let cache = Cache::new(dir, None).unwrap();

    let r = cache.fetch("https://httpbin.org/anything");
    match &r {
        Ok(Status::Fetched(f)) => {
            assert!(f.exists(), "Cache file exists.");
            assert_eq!(f, &expected_file, "Cache path is what we were expecting.");
            let mut header_file = expected_file.clone();
            Cache::append_to_filename(&mut header_file, HEADER_EXT);
            assert!(header_file.exists(), "Cache header file exists.");
            assert_eq!(
                header_file, expected_header,
                "Cache header path is what we expected."
            );
        }
        Ok(other) => panic!("Cache Status is not Status::Fetched, is: {:?}", other),
        Err(_) => panic!("cache.fetch: {:?}", r),
    }
}
/*
Add to Cargo.toml:
[features]
local-httpbin = []
Use:
#[test]
#[cfg(feature = "local-httpbin")]
And then:
cargo test -F local-httpbin -- --show-output
This runs the local httpbin tests.
*/
/// Error paths against a local httpbin: an HTTP error status and a refused
/// connection.
#[test]
#[cfg(feature = "local-httpbin")]
fn call_local() {
    let mut dir = testdir!();
    dir.push("cache");
    let cache = Cache::new(dir, None).unwrap();

    let teapot_url = "http://127.0.0.1/status/418";
    match cache.fetch(teapot_url) {
        Err(Error::HttpErrorStatus(code)) => assert_eq!(code, 418),
        Err(_) => panic!("Not an ErrorStatus"),
        // Reaching this arm means the call succeeded, which is the failure.
        Ok(status) => panic!("Expected HTTP status 418 error, got: {status:?}"),
    }

    let r = cache.fetch("http://127.0.0.1:1024");
    assert!(r.is_err(), "Confirm connection error");

    /*
    I disabled brotli in the Client builder.
    I get an error below about invalid UTF-8. The httpbin server isn't smart
    enough to see I don't support it, and sends it anyway. :(
    */
    /*
    let brot_url = "http://127.0.0.1/brotli";
    let r = cache.fetch(brot_url);
    println!("Brotli: {:?}", r);
    if let Status::Fetched(path) = r {
        let data = std::fs::read_to_string(path).unwrap();
        println!("DATA:\n{}", data);
    }
    */
}
- /*
- These tests require a running local httpbin image.
- ```
- services:
- httpbin:
- image: kennethreitz/httpbin
- ports:
- - "80:80"
- ```
- */
/// Against a local httpbin: the first request downloads, the second one is
/// answered from the cache because the stored etag is sent along.
#[test]
#[cfg(feature = "local-httpbin")]
fn cache_local() {
    let mut dir = testdir!();
    dir.push("cache");
    let cache = Cache::new(dir, None).unwrap();
    let etag_url = "http://127.0.0.1/etag/meow";

    match cache.fetch(etag_url) {
        Ok(Status::Fetched(_)) => {}
        Ok(_) => panic!("Expected Status::Fetched on 1st request."),
        Err(e) => panic!("Unexpected error: {e:?}"),
    }

    // 2nd call, the etag header is set.
    match cache.fetch(etag_url) {
        Ok(Status::Cached(_)) => {}
        Ok(_) => panic!("Expected Status::Cached on 2nd request."),
        Err(e) => panic!("Unexpected error: {e:?}"),
    }
}
- }
|