From 73260c0e3506f5df301c1b44b4fb893fecf172ec Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Sun, 5 Nov 2023 14:55:37 -0500 Subject: [PATCH 01/19] walk: Send WorkerResults in batches --- src/dir_entry.rs | 2 + src/exec/job.rs | 31 +++++----- src/walk.rs | 156 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 137 insertions(+), 52 deletions(-) diff --git a/src/dir_entry.rs b/src/dir_entry.rs index 3a19d59..f44f2be 100644 --- a/src/dir_entry.rs +++ b/src/dir_entry.rs @@ -8,11 +8,13 @@ use lscolors::{Colorable, LsColors, Style}; use crate::config::Config; use crate::filesystem::strip_current_dir; +#[derive(Debug)] enum DirEntryInner { Normal(ignore::DirEntry), BrokenSymlink(PathBuf), } +#[derive(Debug)] pub struct DirEntry { inner: DirEntryInner, metadata: OnceCell>, diff --git a/src/exec/job.rs b/src/exec/job.rs index af603cc..4864d6d 100644 --- a/src/exec/job.rs +++ b/src/exec/job.rs @@ -1,9 +1,6 @@ use std::sync::Mutex; -use crossbeam_channel::Receiver; - use crate::config::Config; -use crate::dir_entry::DirEntry; use crate::error::print_error; use crate::exit_codes::{merge_exitcodes, ExitCode}; use crate::walk::WorkerResult; @@ -14,7 +11,7 @@ use super::CommandSet; /// generate a command with the supplied command template. The generated command will then /// be executed, and this process will continue until the receiver's sender has closed. 
pub fn job( - rx: Receiver, + results: impl IntoIterator, cmd: &CommandSet, out_perm: &Mutex<()>, config: &Config, @@ -22,35 +19,39 @@ pub fn job( // Output should be buffered when only running a single thread let buffer_output: bool = config.threads > 1; - let mut results: Vec = Vec::new(); - loop { + let mut ret = ExitCode::Success; + for result in results { // Obtain the next result from the receiver, else if the channel // has closed, exit from the loop - let dir_entry: DirEntry = match rx.recv() { - Ok(WorkerResult::Entry(dir_entry)) => dir_entry, - Ok(WorkerResult::Error(err)) => { + let dir_entry = match result { + WorkerResult::Entry(dir_entry) => dir_entry, + WorkerResult::Error(err) => { if config.show_filesystem_errors { print_error(err.to_string()); } continue; } - Err(_) => break, }; // Generate a command, execute it and store its exit code. - results.push(cmd.execute( + let code = cmd.execute( dir_entry.stripped_path(config), config.path_separator.as_deref(), out_perm, buffer_output, - )) + ); + ret = merge_exitcodes([ret, code]); } // Returns error in case of any error. 
- merge_exitcodes(results) + ret } -pub fn batch(rx: Receiver, cmd: &CommandSet, config: &Config) -> ExitCode { - let paths = rx +pub fn batch( + results: impl IntoIterator, + cmd: &CommandSet, + config: &Config, +) -> ExitCode { + let paths = results .into_iter() .filter_map(|worker_result| match worker_result { WorkerResult::Entry(dir_entry) => Some(dir_entry.into_stripped_path(config)), diff --git a/src/walk.rs b/src/walk.rs index 691c5d0..d0ecab8 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -4,12 +4,12 @@ use std::io::{self, Write}; use std::mem; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, MutexGuard}; use std::thread; use std::time::{Duration, Instant}; use anyhow::{anyhow, Result}; -use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, Sender}; +use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, SendError, Sender}; use etcetera::BaseStrategy; use ignore::overrides::{Override, OverrideBuilder}; use ignore::{self, WalkBuilder, WalkParallel, WalkState}; @@ -36,6 +36,7 @@ enum ReceiverMode { /// The Worker threads can result in a valid entry having PathBuf or an error. #[allow(clippy::large_enum_variant)] +#[derive(Debug)] pub enum WorkerResult { // Errors should be rare, so it's probably better to allow large_enum_variant than // to box the Entry variant @@ -43,6 +44,81 @@ pub enum WorkerResult { Error(ignore::Error), } +/// A batch of WorkerResults to send over a channel. +#[derive(Clone)] +struct Batch { + items: Arc>>>, +} + +impl Batch { + fn new() -> Self { + Self { + items: Arc::new(Mutex::new(Some(vec![]))), + } + } + + fn lock(&self) -> MutexGuard<'_, Option>> { + self.items.lock().unwrap() + } +} + +impl IntoIterator for Batch { + type Item = WorkerResult; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.lock().take().unwrap().into_iter() + } +} + +/// Wrapper that sends batches of items at once over a channel. 
+struct BatchSender { + batch: Batch, + tx: Sender, +} + +impl BatchSender { + fn new(tx: Sender) -> Self { + Self { + batch: Batch::new(), + tx, + } + } + + /// Check if we need to flush a batch. + fn needs_flush(batch: Option<&Vec>) -> bool { + match batch { + // Limit the batch size to provide some backpressure + Some(vec) => vec.len() >= 0x400, + // Batch was already taken by the receiver, so make a new one + None => true, + } + } + + /// Add an item to a batch. + fn send(&mut self, item: WorkerResult) -> Result<(), SendError<()>> { + let mut batch = self.batch.lock(); + + if Self::needs_flush(batch.as_ref()) { + drop(batch); + self.batch = Batch::new(); + batch = self.batch.lock(); + } + + let items = batch.as_mut().unwrap(); + items.push(item); + + if items.len() == 1 { + // New batch, send it over the channel + self.tx + .send(self.batch.clone()) + .map_err(|_| SendError(()))?; + } + + Ok(()) + } +} + /// Maximum size of the output buffer before flushing results to the console const MAX_BUFFER_LENGTH: usize = 1000; /// Default duration until output buffering switches to streaming. @@ -57,7 +133,7 @@ struct ReceiverBuffer<'a, W> { /// The ^C notifier. interrupt_flag: &'a AtomicBool, /// Receiver for worker results. - rx: Receiver, + rx: Receiver, /// Standard output. stdout: W, /// The current buffer mode. @@ -72,7 +148,7 @@ struct ReceiverBuffer<'a, W> { impl<'a, W: Write> ReceiverBuffer<'a, W> { /// Create a new receiver buffer. - fn new(state: &'a WorkerState, rx: Receiver, stdout: W) -> Self { + fn new(state: &'a WorkerState, rx: Receiver, stdout: W) -> Self { let config = &state.config; let quit_flag = state.quit_flag.as_ref(); let interrupt_flag = state.interrupt_flag.as_ref(); @@ -103,7 +179,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { } /// Receive the next worker result. 
- fn recv(&self) -> Result { + fn recv(&self) -> Result { match self.mode { ReceiverMode::Buffering => { // Wait at most until we should switch to streaming @@ -119,34 +195,40 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { /// Wait for a result or state change. fn poll(&mut self) -> Result<(), ExitCode> { match self.recv() { - Ok(WorkerResult::Entry(dir_entry)) => { - if self.config.quiet { - return Err(ExitCode::HasResults(true)); - } + Ok(batch) => { + for result in batch { + match result { + WorkerResult::Entry(dir_entry) => { + if self.config.quiet { + return Err(ExitCode::HasResults(true)); + } - match self.mode { - ReceiverMode::Buffering => { - self.buffer.push(dir_entry); - if self.buffer.len() > MAX_BUFFER_LENGTH { - self.stream()?; + match self.mode { + ReceiverMode::Buffering => { + self.buffer.push(dir_entry); + if self.buffer.len() > MAX_BUFFER_LENGTH { + self.stream()?; + } + } + ReceiverMode::Streaming => { + self.print(&dir_entry)?; + self.flush()?; + } + } + + self.num_results += 1; + if let Some(max_results) = self.config.max_results { + if self.num_results >= max_results { + return self.stop(); + } + } + } + WorkerResult::Error(err) => { + if self.config.show_filesystem_errors { + print_error(err.to_string()); + } } } - ReceiverMode::Streaming => { - self.print(&dir_entry)?; - self.flush()?; - } - } - - self.num_results += 1; - if let Some(max_results) = self.config.max_results { - if self.num_results >= max_results { - return self.stop(); - } - } - } - Ok(WorkerResult::Error(err)) => { - if self.config.show_filesystem_errors { - print_error(err.to_string()); } } Err(RecvTimeoutError::Timeout) => { @@ -319,13 +401,13 @@ impl WorkerState { /// Run the receiver work, either on this thread or a pool of background /// threads (for --exec). - fn receive(&self, rx: Receiver) -> ExitCode { + fn receive(&self, rx: Receiver) -> ExitCode { let config = &self.config; // This will be set to `Some` if the `--exec` argument was supplied. 
if let Some(ref cmd) = config.command { if cmd.in_batch_mode() { - exec::batch(rx, cmd, &config) + exec::batch(rx.into_iter().flatten(), cmd, &config) } else { let out_perm = Mutex::new(()); @@ -337,7 +419,8 @@ impl WorkerState { let rx = rx.clone(); // Spawn a job thread that will listen for and execute inputs. - let handle = scope.spawn(|| exec::job(rx, cmd, &out_perm, &config)); + let handle = scope + .spawn(|| exec::job(rx.into_iter().flatten(), cmd, &out_perm, &config)); // Push the handle of the spawned thread into the vector for later joining. handles.push(handle); @@ -355,12 +438,12 @@ impl WorkerState { } /// Spawn the sender threads. - fn spawn_senders(&self, walker: WalkParallel, tx: Sender) { + fn spawn_senders(&self, walker: WalkParallel, tx: Sender) { walker.run(|| { let patterns = &self.patterns; let config = &self.config; let quit_flag = self.quit_flag.as_ref(); - let tx = tx.clone(); + let mut tx = BatchSender::new(tx.clone()); Box::new(move |entry| { if quit_flag.load(Ordering::Relaxed) { @@ -545,8 +628,7 @@ impl WorkerState { .unwrap(); } - // Channel capacity was chosen empircally to perform similarly to an unbounded channel - let (tx, rx) = bounded(0x4000 * config.threads); + let (tx, rx) = bounded(config.threads); let exit_code = thread::scope(|scope| { // Spawn the receiver thread(s) From b8a5f95cf243eadc69a20029b582db01992fe878 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Wed, 8 Nov 2023 10:24:00 -0500 Subject: [PATCH 02/19] walk: Limit batch sizes in --exec mode --- src/walk.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/walk.rs b/src/walk.rs index d0ecab8..c6737ed 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -75,21 +75,23 @@ impl IntoIterator for Batch { struct BatchSender { batch: Batch, tx: Sender, + limit: usize, } impl BatchSender { - fn new(tx: Sender) -> Self { + fn new(tx: Sender, limit: usize) -> Self { Self { batch: Batch::new(), tx, + limit, } } /// Check if we need to 
flush a batch. - fn needs_flush(batch: Option<&Vec>) -> bool { + fn needs_flush(&self, batch: Option<&Vec>) -> bool { match batch { // Limit the batch size to provide some backpressure - Some(vec) => vec.len() >= 0x400, + Some(vec) => vec.len() >= self.limit, // Batch was already taken by the receiver, so make a new one None => true, } @@ -99,7 +101,7 @@ impl BatchSender { fn send(&mut self, item: WorkerResult) -> Result<(), SendError<()>> { let mut batch = self.batch.lock(); - if Self::needs_flush(batch.as_ref()) { + if self.needs_flush(batch.as_ref()) { drop(batch); self.batch = Batch::new(); batch = self.batch.lock(); @@ -443,7 +445,15 @@ impl WorkerState { let patterns = &self.patterns; let config = &self.config; let quit_flag = self.quit_flag.as_ref(); - let mut tx = BatchSender::new(tx.clone()); + + let mut limit = 0x100; + if let Some(cmd) = &config.command { + if !cmd.in_batch_mode() && config.threads > 1 { + // Evenly distribute work between multiple receivers + limit = 1; + } + } + let mut tx = BatchSender::new(tx.clone(), limit); Box::new(move |entry| { if quit_flag.load(Ordering::Relaxed) { @@ -628,7 +638,7 @@ impl WorkerState { .unwrap(); } - let (tx, rx) = bounded(config.threads); + let (tx, rx) = bounded(2 * config.threads); let exit_code = thread::scope(|scope| { // Spawn the receiver thread(s) From 4b4a74c988197a1762f3e19325a929f0d5678d06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:25:38 +0000 Subject: [PATCH 03/19] build(deps): bump clap from 4.4.7 to 4.4.10 Bumps [clap](https://github.com/clap-rs/clap) from 4.4.7 to 4.4.10. - [Release notes](https://github.com/clap-rs/clap/releases) - [Changelog](https://github.com/clap-rs/clap/blob/master/CHANGELOG.md) - [Commits](https://github.com/clap-rs/clap/compare/v4.4.7...v4.4.10) --- updated-dependencies: - dependency-name: clap dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 28843dc..8f8c070 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.4.7" +version = "4.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac495e00dcec98c83465d5ad66c5c4fabd652fd6686e7c6269b117e729a6f17b" +checksum = "41fffed7514f420abec6d183b1d3acfd9099c79c3a10a06ade4f8203f1411272" dependencies = [ "clap_builder", "clap_derive", @@ -164,9 +164,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.7" +version = "4.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c77ed9a32a62e6ca27175d00d29d05ca32e396ea1eb5fb01d8256b669cec7663" +checksum = "63361bae7eef3771745f02d8d892bec2fee5f6e34af316ba556e7f97a7069ff1" dependencies = [ "anstream", "anstyle", diff --git a/Cargo.toml b/Cargo.toml index 4f2f464..2297ddd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,7 +51,7 @@ clap_complete = {version = "4.4.4", optional = true} faccess = "0.2.4" [dependencies.clap] -version = "4.4.7" +version = "4.4.10" features = ["suggestions", "color", "wrap_help", "cargo", "derive"] [dependencies.chrono] From 0853e35e1fcf6ca61797515dd71a3c643f401d67 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:25:28 +0000 Subject: [PATCH 04/19] build(deps): bump libc from 0.2.149 to 0.2.150 Bumps [libc](https://github.com/rust-lang/libc) from 0.2.149 to 0.2.150. - [Release notes](https://github.com/rust-lang/libc/releases) - [Commits](https://github.com/rust-lang/libc/compare/0.2.149...0.2.150) --- updated-dependencies: - dependency-name: libc dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 28843dc..f2be405 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -465,9 +465,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "linux-raw-sys" From e1ecba2ce49470f8106bec3fd75bba2b8a322a4e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:25:20 +0000 Subject: [PATCH 05/19] build(deps): bump lscolors from 0.15.0 to 0.16.0 Bumps [lscolors](https://github.com/sharkdp/lscolors) from 0.15.0 to 0.16.0. - [Release notes](https://github.com/sharkdp/lscolors/releases) - [Commits](https://github.com/sharkdp/lscolors/compare/v0.15.0...0.16.0) --- updated-dependencies: - dependency-name: lscolors dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2be405..50b66c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -483,9 +483,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "lscolors" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf7015a04103ad78abb77e4b79ed151e767922d1cfde5f62640471c629a2320d" +checksum = "ab0b209ec3976527806024406fe765474b9a1750a0ed4b8f0372364741f50e7b" dependencies = [ "nu-ansi-term", ] diff --git a/Cargo.toml b/Cargo.toml index 4f2f464..eb5e785 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ default-features = false features = ["std", "clock"] [dependencies.lscolors] -version = "0.15" +version = "0.16" default-features = false features = ["nu-ansi-term"] From 4202f3939ef12cafcfe93cb049643d3e267bcb49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:25:13 +0000 Subject: [PATCH 06/19] build(deps): bump test-case from 3.2.1 to 3.3.1 Bumps [test-case](https://github.com/frondeus/test-case) from 3.2.1 to 3.3.1. - [Release notes](https://github.com/frondeus/test-case/releases) - [Changelog](https://github.com/frondeus/test-case/blob/master/CHANGELOG.md) - [Commits](https://github.com/frondeus/test-case/compare/v3.2.1...v3.3.1) --- updated-dependencies: - dependency-name: test-case dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50b66c3..ebcf7fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -749,9 +749,9 @@ dependencies = [ [[package]] name = "test-case" -version = "3.2.1" +version = "3.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8f1e820b7f1d95a0cdbf97a5df9de10e1be731983ab943e56703ac1b8e9d425" +checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8" dependencies = [ "test-case-macros", ] diff --git a/Cargo.toml b/Cargo.toml index eb5e785..85824ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,7 +80,7 @@ jemallocator = {version = "0.5.4", optional = true} diff = "0.1" tempfile = "3.8" filetime = "0.2" -test-case = "3.1" +test-case = "3.3" [profile.release] lto = true From 00b64f3ccbfb832ef744bb42bbdfabaf929b5ee2 Mon Sep 17 00:00:00 2001 From: David Peter Date: Fri, 8 Dec 2023 13:37:03 +0100 Subject: [PATCH 07/19] Suggest to use `-u` instead of `-HI` --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dc16cb6..3611e87 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ target/debug/deps/libnum_cpus-f5ce7ef99006aa05.rlib ``` To really search *all* files and directories, simply combine the hidden and ignore features to show -everything (`-HI`). +everything (`-HI`) or use `-u`/`--unrestricted`. ### Matching the full path By default, *fd* only matches the filename of each file. However, using the `--full-path` or `-p` option, @@ -406,9 +406,9 @@ for alternative, more complete (or more colorful) variants, see [here](https://g Remember that `fd` ignores hidden directories and files by default. It also ignores patterns from `.gitignore` files. 
If you want to make sure to find absolutely every possible file, always -use the options `-H` and `-I` to disable these two features: +use the options `-u`/`--unrestricted` option (or `-HI` to enable hidden and ignored files): ``` bash -> fd -HI … +> fd -u … ``` ### `fd` doesn't seem to interpret my regex pattern correctly From fea16227248fe806095ab69ede11f1031c2f40e6 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Wed, 29 Nov 2023 16:53:24 -0500 Subject: [PATCH 08/19] cli: Tweak default thread count logic --- CHANGELOG.md | 8 +++++--- src/cli.rs | 24 +++++++----------------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32667aa..31ca18f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,15 +5,17 @@ - Breaking: `.git/` is now ignored by default when using `--hidden` / `-H`, use `--no-ignore` / `-I` or `--no-ignore-vcs` to override, see #1387 and #1396 (@skoriop) - ## Bugfixes - Fix `NO_COLOR` support, see #1421 (@acuteenvy) ## Changes -- The default number of threads is now constrained to be at most 16. This should improve startup time on - systems with many CPU cores. (#1203) +- Performance has been significantly improved, both due to optimizations in the underlying `ignore` + crate (#1429), and in `fd` itself (#1422). + +- The default number of threads is now constrained to be at most 64. This should improve startup time on + systems with many CPU cores. (#1203, #1412, #1431) ## Other diff --git a/src/cli.rs b/src/cli.rs index 1b02288..3bd8d84 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -715,24 +715,14 @@ impl Opts { fn default_num_threads() -> NonZeroUsize { // If we can't get the amount of parallelism for some reason, then // default to a single thread, because that is safe. - // Note that the minimum value for a NonZeroUsize is 1. - // Unfortunately, we can't do `NonZeroUsize::new(1).unwrap()` - // in a const context. 
- const FALLBACK_PARALLELISM: NonZeroUsize = NonZeroUsize::MIN; - // As the number of threads increases, the startup time suffers from - // initializing the threads, and we get diminishing returns from additional - // parallelism. So set a maximum number of threads to use by default. - // - // This value is based on some empirical observations, but the ideal value - // probably depends on the exact hardware in use. - // - // Safety: The literal "20" is known not to be zero. - const MAX_DEFAULT_THREADS: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(20) }; + let fallback = NonZeroUsize::MIN; + // To limit startup overhead on massively parallel machines, don't use more + // than 64 threads. + let limit = NonZeroUsize::new(64).unwrap(); - std::cmp::min( - std::thread::available_parallelism().unwrap_or(FALLBACK_PARALLELISM), - MAX_DEFAULT_THREADS, - ) + std::thread::available_parallelism() + .unwrap_or(fallback) + .min(limit) } #[derive(Copy, Clone, PartialEq, Eq, ValueEnum)] From 16c2d1e1d05e815ef5b564cb7710b80853b75066 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Wed, 13 Dec 2023 14:23:14 -0500 Subject: [PATCH 09/19] walk: Flush stdout in batches The previous behaviour was designed to mimic the output buffering of typical UNIX tools: line-buffered if stdout is a TTY, and fully-buffered otherwise. More precisely, when printing to a terminal, fd would flush explicitly after printing any buffered results, then flush after every single result once streaming mode started. When not printing to a terminal, fd never explicitly flushed, so writes would only happen as the BufWriter filled up. The new behaviour actually unifies the TTY and non-TTY cases: we flush after printing the buffered results, then once we start streaming, we flush after each batch, but *only when the channel is empty*. This provides a good balance: if the channel is empty, the receiver thread might as well flush before it goes to sleep waiting for more results. 
If the channel is non-empty, we might as well process those results before deciding to flush. For TTYs, this should improve performance by consolidating write() calls without sacrificing interactivity. For non-TTYs, we'll be flushing more often, but only when the receiver would otherwise have nothing to do, thus improving interactivity without sacrificing performance. This is particularly handy when fd is piped into another command (such as head or grep): with the old behaviour, fd could wait for the whole traversal to finish before printing anything. With the new behaviour, fd will print those results soon after they are received. Fixes #1313. --- src/walk.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/walk.rs b/src/walk.rs index c6737ed..c81d2a4 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -214,7 +214,6 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { } ReceiverMode::Streaming => { self.print(&dir_entry)?; - self.flush()?; } } @@ -232,6 +231,11 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { } } } + + // If we don't have another batch ready, flush before waiting + if self.mode == ReceiverMode::Streaming && self.rx.is_empty() { + self.flush()?; + } } Err(RecvTimeoutError::Timeout) => { self.stream()?; @@ -285,7 +289,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { /// Flush stdout if necessary. fn flush(&mut self) -> Result<(), ExitCode> { - if self.config.interactive_terminal && self.stdout.flush().is_err() { + if self.stdout.flush().is_err() { // Probably a broken pipe. Exit gracefully. 
return Err(ExitCode::GeneralError); } From e3b40208d5cc3e0469f703637475a344aafdc772 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 10:46:26 +0100 Subject: [PATCH 10/19] Preparations for fd 9 release --- CHANGELOG.md | 32 +++++++++++++++++++++++--------- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 8 ++++---- doc/release-checklist.md | 2 +- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 31ca18f..2da02ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,21 @@ -# Upcoming release +# v9.0.0 + +## Performance + +- Performance has been *significantly improved*, both due to optimizations in the underlying `ignore` + crate (#1429), and in `fd` itself (#1422, #1408, #13620) - @tavianator. + [Benchmarks results](https://gist.github.com/tavianator/32edbe052f33ef60570cf5456b59de81) show gains + of 6-8x for full traversals of smaller directories (100k files) and up to 13x for larger directories (1M files). + +- The default number of threads is now constrained to be at most 64. This should improve startup time on + systems with many CPU cores. (#1203, #1410, #1412, #1431) - @tmccombs and @tavianator + +- New flushing behavior when writing output to stdout, providing better performance for TTY and non-TTY + use cases, see #1452 and #1313 (@tavianator). ## Features +- Support character and block device file types, see #1213 and #1336 (@cgzones) - Breaking: `.git/` is now ignored by default when using `--hidden` / `-H`, use `--no-ignore` / `-I` or `--no-ignore-vcs` to override, see #1387 and #1396 (@skoriop) @@ -9,16 +23,16 @@ - Fix `NO_COLOR` support, see #1421 (@acuteenvy) -## Changes - -- Performance has been significantly improved, both due to optimizations in the underlying `ignore` - crate (#1429), and in `fd` itself (#1422). - -- The default number of threads is now constrained to be at most 64. This should improve startup time on - systems with many CPU cores. 
(#1203, #1412, #1431) - ## Other +- Fixed documentation typos, see #1409 (@marcospb19) + +## Thanks + +Special thanks to @tavianator for his incredible work on performance in the `ignore` crate and `fd` itself. + + + # v8.7.1 ## Bugfixes diff --git a/Cargo.lock b/Cargo.lock index c28b0eb..cb159cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -313,7 +313,7 @@ checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fd-find" -version = "8.7.1" +version = "9.0.0" dependencies = [ "aho-corasick", "anyhow", diff --git a/Cargo.toml b/Cargo.toml index 77c1d09..6ca9f90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ license = "MIT OR Apache-2.0" name = "fd-find" readme = "README.md" repository = "https://github.com/sharkdp/fd" -version = "8.7.1" +version = "9.0.0" edition= "2021" rust-version = "1.70.0" diff --git a/README.md b/README.md index 3611e87..d8e1327 100644 --- a/README.md +++ b/README.md @@ -314,8 +314,8 @@ Options: -d, --max-depth Set maximum search depth (default: none) -E, --exclude Exclude entries that match the given glob pattern -t, --type Filter by type: file (f), directory (d), symlink (l), - executable (x), empty (e), socket (s), pipe (p), - block-device (b), char-device (c) + executable (x), empty (e), socket (s), pipe (p), char-device + (c), block-device (b) -e, --extension Filter by file extension -S, --size Limit results based on the size of files --changed-within Filter by file modification time (newer than) @@ -543,7 +543,7 @@ Make sure that `$HOME/.local/bin` is in your `$PATH`. 
If you use an older version of Ubuntu, you can download the latest `.deb` package from the [release page](https://github.com/sharkdp/fd/releases) and install it via: ``` bash -sudo dpkg -i fd_8.7.1_amd64.deb # adapt version number and architecture +sudo dpkg -i fd_9.0.0_amd64.deb # adapt version number and architecture ``` ### On Debian @@ -677,7 +677,7 @@ With Rust's package manager [cargo](https://github.com/rust-lang/cargo), you can ``` cargo install fd-find ``` -Note that rust version *1.64.0* or later is required. +Note that rust version *1.70.0* or later is required. `make` is also needed for the build. diff --git a/doc/release-checklist.md b/doc/release-checklist.md index b11e75d..0a20802 100644 --- a/doc/release-checklist.md +++ b/doc/release-checklist.md @@ -9,7 +9,7 @@ necessary changes for the upcoming release. - [ ] Update version in `Cargo.toml`. Run `cargo build` to update `Cargo.lock`. Make sure to `git add` the `Cargo.lock` changes as well. - [ ] Find the current min. supported Rust version by running - `grep '^\s*MIN_SUPPORTED_RUST_VERSION' .github/workflows/CICD.yml`. + `grep rust-version Cargo.toml`. - [ ] Update the `fd` version and the min. supported Rust version in `README.md`. - [ ] Update `CHANGELOG.md`. Change the heading of the *"Upcoming release"* section to the version of this release. From 61ebd9be6a0c0e380028509be9553aac7931fe08 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 11:10:54 +0100 Subject: [PATCH 11/19] Update benchmark results --- README.md | 62 +++++++++++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index d8e1327..2e89126 100644 --- a/README.md +++ b/README.md @@ -331,61 +331,45 @@ Options: ## Benchmark -Let's search my home folder for files that end in `[0-9].jpg`. It contains ~190.000 -subdirectories and about a million files. 
For averaging and statistical analysis, I'm using +Let's search my home folder for files that end in `[0-9].jpg`. It contains ~750.000 +subdirectories and about a 4 million files. For averaging and statistical analysis, I'm using [hyperfine](https://github.com/sharkdp/hyperfine). The following benchmarks are performed with a "warm"/pre-filled disk-cache (results for a "cold" disk-cache show the same trends). Let's start with `find`: ``` -Benchmark #1: find ~ -iregex '.*[0-9]\.jpg$' - - Time (mean ± σ): 7.236 s ± 0.090 s - - Range (min … max): 7.133 s … 7.385 s +Benchmark 1: find ~ -iregex '.*[0-9]\.jpg$' + Time (mean ± σ): 19.922 s ± 0.109 s + Range (min … max): 19.765 s … 20.065 s ``` `find` is much faster if it does not need to perform a regular-expression search: ``` -Benchmark #2: find ~ -iname '*[0-9].jpg' - - Time (mean ± σ): 3.914 s ± 0.027 s - - Range (min … max): 3.876 s … 3.964 s +Benchmark 2: find ~ -iname '*[0-9].jpg' + Time (mean ± σ): 11.226 s ± 0.104 s + Range (min … max): 11.119 s … 11.466 s ``` -Now let's try the same for `fd`. Note that `fd` *always* performs a regular expression -search. The options `--hidden` and `--no-ignore` are needed for a fair comparison, -otherwise `fd` does not have to traverse hidden folders and ignored paths (see below): +Now let's try the same for `fd`. Note that `fd` performs a regular expression +search by defautl. The options `-u`/`--unrestricted` option is needed here for +a fair comparison. Otherwise `fd` does not have to traverse hidden folders and +ignored paths (see below): ``` -Benchmark #3: fd -HI '.*[0-9]\.jpg$' ~ - - Time (mean ± σ): 811.6 ms ± 26.9 ms - - Range (min … max): 786.0 ms … 870.7 ms +Benchmark 3: fd -u '[0-9]\.jpg$' ~ + Time (mean ± σ): 854.8 ms ± 10.0 ms + Range (min … max): 839.2 ms … 868.9 ms ``` -For this particular example, `fd` is approximately nine times faster than `find -iregex` -and about five times faster than `find -iname`. 
By the way, both tools found the exact -same 20880 files :smile:. +For this particular example, `fd` is approximately **23 times faster** than `find -iregex` +and about **13 times faster** than `find -iname`. By the way, both tools found the exact +same 546 files :smile:. -Finally, let's run `fd` without `--hidden` and `--no-ignore` (this can lead to different -search results, of course). If *fd* does not have to traverse the hidden and git-ignored -folders, it is almost an order of magnitude faster: -``` -Benchmark #4: fd '[0-9]\.jpg$' ~ - - Time (mean ± σ): 123.7 ms ± 6.0 ms - - Range (min … max): 118.8 ms … 140.0 ms -``` - -**Note**: This is *one particular* benchmark on *one particular* machine. While I have -performed quite a lot of different tests (and found consistent results), things might -be different for you! I encourage everyone to try it out on their own. See +**Note**: This is *one particular* benchmark on *one particular* machine. While we have +performed a lot of different tests (and found consistent results), things might +be different for you! We encourage everyone to try it out on their own. See [this repository](https://github.com/sharkdp/fd-benchmarks) for all necessary scripts. -Concerning *fd*'s speed, the main credit goes to the `regex` and `ignore` crates that are also used -in [ripgrep](https://github.com/BurntSushi/ripgrep) (check it out!). +Concerning *fd*'s speed, a lot of credit goes to the `regex` and `ignore` crates that are +also used in [ripgrep](https://github.com/BurntSushi/ripgrep) (check it out!). 
## Troubleshooting From d9c4e6239fc1807bce1bb6aca4426f3880230a84 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 11:20:17 +0100 Subject: [PATCH 12/19] Fix names for ARM Debian packages --- .github/workflows/CICD.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CICD.yml b/.github/workflows/CICD.yml index e3182b3..228dbb4 100644 --- a/.github/workflows/CICD.yml +++ b/.github/workflows/CICD.yml @@ -209,7 +209,7 @@ jobs: DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }} DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }}-musl - case ${{ matrix.job.target }} in *-musl) DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }}-musl ; DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }} ;; esac; + case ${{ matrix.job.target }} in *-musl*) DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }}-musl ; DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }} ;; esac; DPKG_VERSION=${{ needs.crate_metadata.outputs.version }} unset DPKG_ARCH From 13a93e5cbe78b0e7d7c8891f9619eca499f65721 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 11:42:27 +0100 Subject: [PATCH 13/19] Add new unreleased section --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2da02ec..8723e40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +# Upcoming release + +## Features + + +## Bugfixes + + +## Changes + + +## Other + + + + # v9.0.0 ## Performance From a03ed8b300ec047f17c1381c21faa415095840b9 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 11:44:20 +0100 Subject: [PATCH 14/19] Update license information --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 2e89126..2db1312 100644 --- a/README.md +++ b/README.md @@ -692,8 +692,6 @@ cargo install --path . 
## License -Copyright (c) 2017-2021 The fd developers - `fd` is distributed under the terms of both the MIT License and the Apache License 2.0. See the [LICENSE-APACHE](LICENSE-APACHE) and [LICENSE-MIT](LICENSE-MIT) files for license details. From 07343b5baff84bc120c951b352166782e9fcdda1 Mon Sep 17 00:00:00 2001 From: David Peter Date: Tue, 19 Dec 2023 11:45:09 +0100 Subject: [PATCH 15/19] Update troubleshooting section --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2db1312..c91ff98 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,15 @@ also used in [ripgrep](https://github.com/BurntSushi/ripgrep) (check it out!). ## Troubleshooting +### `fd` does not find my file! + +Remember that `fd` ignores hidden directories and files by default. It also ignores patterns +from `.gitignore` files. If you want to make sure to find absolutely every possible file, always +use the `-u`/`--unrestricted` option (or `-HI` to enable hidden and ignored files): +``` bash +> fd -u … +``` + ### Colorized output `fd` can colorize files by extension, just like `ls`. In order for this to work, the environment @@ -386,15 +395,6 @@ for alternative, more complete (or more colorful) variants, see [here](https://g `fd` also honors the [`NO_COLOR`](https://no-color.org/) environment variable. -### `fd` does not find my file! - -Remember that `fd` ignores hidden directories and files by default. It also ignores patterns -from `.gitignore` files. If you want to make sure to find absolutely every possible file, always -use the options `-u`/`--unrestricted` option (or `-HI` to enable hidden and ignored files): -``` bash -> fd -u … -``` - ### `fd` doesn't seem to interpret my regex pattern correctly A lot of special regex characters (like `[]`, `^`, `$`, ..) 
are also special characters in your From 954a3900b9df556b882ad0e7ec1405a19eab0985 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Tue, 19 Dec 2023 14:04:01 -0500 Subject: [PATCH 16/19] CHANGELOG: Fix issue number typo --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8723e40..fa857d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ ## Performance - Performance has been *significantly improved*, both due to optimizations in the underlying `ignore` - crate (#1429), and in `fd` itself (#1422, #1408, #13620) - @tavianator. + crate (#1429), and in `fd` itself (#1422, #1408, #1362) - @tavianator. [Benchmarks results](https://gist.github.com/tavianator/32edbe052f33ef60570cf5456b59de81) show gains of 6-8x for full traversals of smaller directories (100k files) and up to 13x for larger directories (1M files). From 9529f30129e2696bc4a4027561e66c6de69b3195 Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Sat, 23 Dec 2023 01:09:36 -0700 Subject: [PATCH 17/19] fix: Respect NO_COLOR with --list-details option Fixes: #1455 --- CHANGELOG.md | 2 ++ src/cli.rs | 11 ----------- src/main.rs | 10 +++++++--- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa857d1..e852038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ ## Bugfixes +- Respect NO_COLOR environment variable with `--list-details` option. 
(#1455) + ## Changes diff --git a/src/cli.rs b/src/cli.rs index 3bd8d84..0007a31 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -758,17 +758,6 @@ pub enum ColorWhen { Never, } -impl ColorWhen { - pub fn as_str(&self) -> &'static str { - use ColorWhen::*; - match *self { - Auto => "auto", - Never => "never", - Always => "always", - } - } -} - // there isn't a derive api for getting grouped values yet, // so we have to use hand-rolled parsing for exec and exec-batch pub struct Exec { diff --git a/src/main.rs b/src/main.rs index 5440601..bef4120 100644 --- a/src/main.rs +++ b/src/main.rs @@ -325,18 +325,22 @@ fn extract_command(opts: &mut Opts, colored_output: bool) -> Result Result> { +fn determine_ls_command(colored_output: bool) -> Result> { #[allow(unused)] let gnu_ls = |command_name| { + let color_arg = if colored_output { + "--color=always" + } else { + "--color=never" + }; // Note: we use short options here (instead of --long-options) to support more // platforms (like BusyBox). vec![ From aeb4a5fdad1109b8ef1cddfd0ad347bceef38ed7 Mon Sep 17 00:00:00 2001 From: Sayan Goswami Date: Thu, 28 Dec 2023 00:52:18 +0530 Subject: [PATCH 18/19] Fixes typo in README Fixes a tiny typo: ~defautl~ to default. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c91ff98..e2de9d1 100644 --- a/README.md +++ b/README.md @@ -351,7 +351,7 @@ Benchmark 2: find ~ -iname '*[0-9].jpg' ``` Now let's try the same for `fd`. Note that `fd` performs a regular expression -search by defautl. The options `-u`/`--unrestricted` option is needed here for +search by default. The options `-u`/`--unrestricted` option is needed here for a fair comparison. 
Otherwise `fd` does not have to traverse hidden folders and ignored paths (see below): ``` From 5cd15536b668c206ce994fc9b28da25f2cf430d4 Mon Sep 17 00:00:00 2001 From: Roshan Jossy Date: Thu, 28 Dec 2023 00:50:02 +0100 Subject: [PATCH 19/19] Format notes in Readme --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e2de9d1..212f577 100644 --- a/README.md +++ b/README.md @@ -261,7 +261,9 @@ To make exclude-patterns like these permanent, you can create a `.fdignore` file /mnt/external-drive *.bak ``` -Note: `fd` also supports `.ignore` files that are used by other programs such as `rg` or `ag`. + +> [!NOTE] +> `fd` also supports `.ignore` files that are used by other programs such as `rg` or `ag`. If you want `fd` to ignore these patterns globally, you can put them in `fd`'s global ignore file. This is usually located in `~/.config/fd/ignore` in macOS or Linux, and `%APPDATA%\fd\ignore` in @@ -284,7 +286,8 @@ option: If you also want to remove a certain class of directories, you can use the same technique. You will have to use `rm`s `--recursive`/`-r` flag to remove directories. -Note: there are scenarios where using `fd … -X rm -r` can cause race conditions: if you have a +> [!NOTE] +> There are scenarios where using `fd … -X rm -r` can cause race conditions: if you have a path like `…/foo/bar/foo/…` and want to remove all directories named `foo`, you can end up in a situation where the outer `foo` directory is removed first, leading to (harmless) *"'foo/bar/foo': No such file or directory"* errors in the `rm` call.