diff --git a/.github/workflows/CICD.yml b/.github/workflows/CICD.yml
index e3182b3..228dbb4 100644
--- a/.github/workflows/CICD.yml
+++ b/.github/workflows/CICD.yml
@@ -209,7 +209,7 @@ jobs:
 
         DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }}
         DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }}-musl
-        case ${{ matrix.job.target }} in *-musl) DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }}-musl ; DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }} ;; esac;
+        case ${{ matrix.job.target }} in *-musl*) DPKG_BASENAME=${{ needs.crate_metadata.outputs.name }}-musl ; DPKG_CONFLICTS=${{ needs.crate_metadata.outputs.name }} ;; esac;
         DPKG_VERSION=${{ needs.crate_metadata.outputs.version }}
 
         unset DPKG_ARCH
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32667aa..e852038 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,21 +2,55 @@
 
 ## Features
 
+
+## Bugfixes
+
+- Respect the `NO_COLOR` environment variable with the `--list-details` option. (#1455)
+
+
+## Changes
+
+
+## Other
+
+
+
+
+# v9.0.0
+
+## Performance
+
+- Performance has been *significantly improved*, both due to optimizations in the underlying `ignore`
+  crate (#1429), and in `fd` itself (#1422, #1408, #1362) - @tavianator.
+  [Benchmark results](https://gist.github.com/tavianator/32edbe052f33ef60570cf5456b59de81) show gains
+  of 6-8x for full traversals of smaller directories (100k files) and up to 13x for larger directories (1M files).
+
+- The default number of threads is now constrained to be at most 64. This should improve startup time on
+  systems with many CPU cores. (#1203, #1410, #1412, #1431) - @tmccombs and @tavianator
+
+- New flushing behavior when writing output to stdout, providing better performance for TTY and non-TTY
+  use cases, see #1452 and #1313 (@tavianator).
+
+## Features
+
+- Support character and block device file types, see #1213 and #1336 (@cgzones)
 - Breaking: `.git/` is now ignored by default when using `--hidden` / `-H`, use `--no-ignore` / `-I` or
   `--no-ignore-vcs` to override, see #1387 and #1396 (@skoriop)
 
-
 ## Bugfixes
 
 - Fix `NO_COLOR` support, see #1421 (@acuteenvy)
 
-## Changes
-
-- The default number of threads is now constrained to be at most 16. This should improve startup time on
-  systems with many CPU cores. (#1203)
-
 ## Other
 
+- Fixed documentation typos, see #1409 (@marcospb19)
+
+## Thanks
+
+Special thanks to @tavianator for his incredible work on performance in the `ignore` crate and `fd` itself.
+
+
+
 # v8.7.1
 
 ## Bugfixes
diff --git a/Cargo.lock b/Cargo.lock
index 28843dc..cb159cb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -154,9 +154,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.4.7"
+version = "4.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac495e00dcec98c83465d5ad66c5c4fabd652fd6686e7c6269b117e729a6f17b"
+checksum = "41fffed7514f420abec6d183b1d3acfd9099c79c3a10a06ade4f8203f1411272"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -164,9 +164,9 @@ dependencies = [
 
 [[package]]
 name = "clap_builder"
-version = "4.4.7"
+version = "4.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c77ed9a32a62e6ca27175d00d29d05ca32e396ea1eb5fb01d8256b669cec7663"
+checksum = "63361bae7eef3771745f02d8d892bec2fee5f6e34af316ba556e7f97a7069ff1"
 dependencies = [
  "anstream",
  "anstyle",
@@ -313,7 +313,7 @@ checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
 
 [[package]]
 name = "fd-find"
-version = "8.7.1"
+version = "9.0.0"
 dependencies = [
  "aho-corasick",
  "anyhow",
@@ -465,9 +465,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
 [[package]]
 name = "libc"
-version = "0.2.149"
+version = "0.2.150"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
+checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
 
 [[package]]
 name = "linux-raw-sys"
@@ -483,9 +483,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
 [[package]]
 name = "lscolors"
-version = "0.15.0"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf7015a04103ad78abb77e4b79ed151e767922d1cfde5f62640471c629a2320d"
+checksum = "ab0b209ec3976527806024406fe765474b9a1750a0ed4b8f0372364741f50e7b"
 dependencies = [
  "nu-ansi-term",
 ]
@@ -749,9 +749,9 @@ dependencies = [
 
 [[package]]
 name = "test-case"
-version = "3.2.1"
+version = "3.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8f1e820b7f1d95a0cdbf97a5df9de10e1be731983ab943e56703ac1b8e9d425"
+checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8"
 dependencies = [
  "test-case-macros",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 4f2f464..6ca9f90 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@ license = "MIT OR Apache-2.0"
 name = "fd-find"
 readme = "README.md"
 repository = "https://github.com/sharkdp/fd"
-version = "8.7.1"
+version = "9.0.0"
 edition= "2021"
 rust-version = "1.70.0"
 
@@ -51,7 +51,7 @@ clap_complete = {version = "4.4.4", optional = true}
 faccess = "0.2.4"
 
 [dependencies.clap]
-version = "4.4.7"
+version = "4.4.10"
 features = ["suggestions", "color", "wrap_help", "cargo", "derive"]
 
 [dependencies.chrono]
@@ -60,7 +60,7 @@ default-features = false
 features = ["std", "clock"]
 
 [dependencies.lscolors]
-version = "0.15"
+version = "0.16"
 default-features = false
 features = ["nu-ansi-term"]
 
@@ -80,7 +80,7 @@ jemallocator = {version = "0.5.4", optional = true}
 diff = "0.1"
 tempfile = "3.8"
 filetime = "0.2"
-test-case = "3.1"
+test-case = "3.3"
 
 [profile.release]
 lto = true
diff --git a/README.md b/README.md
index 9e71d63..18cbdb0 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ target/debug/deps/libnum_cpus-f5ce7ef99006aa05.rlib
 ```
 
 To really search *all* files and directories, simply combine the hidden and ignore features to show
-everything (`-HI`).
+everything (`-HI`) or use `-u`/`--unrestricted`.
 
 ### Matching the full path
 By default, *fd* only matches the filename of each file. However, using the `--full-path` or `-p` option,
@@ -261,7 +261,9 @@ To make exclude-patterns like these permanent, you can create a `.fdignore` file
 /mnt/external-drive
 *.bak
 ```
-Note: `fd` also supports `.ignore` files that are used by other programs such as `rg` or `ag`.
+
+> [!NOTE]
+> `fd` also supports `.ignore` files that are used by other programs such as `rg` or `ag`.
 
 If you want `fd` to ignore these patterns globally, you can put them in `fd`'s global ignore file.
 This is usually located in `~/.config/fd/ignore` in macOS or Linux, and `%APPDATA%\fd\ignore` in
@@ -284,7 +286,8 @@ option:
 If you also want to remove a certain class of directories, you can use the same technique. You will
 have to use `rm`s `--recursive`/`-r` flag to remove directories.
 
-Note: there are scenarios where using `fd … -X rm -r` can cause race conditions: if you have a
+> [!NOTE]
+> There are scenarios where using `fd … -X rm -r` can cause race conditions: if you have a
 path like `…/foo/bar/foo/…` and want to remove all directories named `foo`, you can end up in a
 situation where the outer `foo` directory is removed first, leading to (harmless) *"'foo/bar/foo':
 No such file or directory"* errors in the `rm` call.
@@ -331,64 +334,57 @@ Options:
 
 ## Benchmark
 
-Let's search my home folder for files that end in `[0-9].jpg`. It contains ~190.000
-subdirectories and about a million files. For averaging and statistical analysis, I'm using
+Let's search my home folder for files that end in `[0-9].jpg`. It contains ~750.000
+subdirectories and about 4 million files. For averaging and statistical analysis, I'm using
 [hyperfine](https://github.com/sharkdp/hyperfine). The following benchmarks are performed
 with a "warm"/pre-filled disk-cache (results for a "cold" disk-cache show the same trends).
 
 Let's start with `find`:
 ```
-Benchmark #1: find ~ -iregex '.*[0-9]\.jpg$'
-
-  Time (mean ± σ):      7.236 s ±  0.090 s
-
-  Range (min … max):    7.133 s …  7.385 s
+Benchmark 1: find ~ -iregex '.*[0-9]\.jpg$'
+  Time (mean ± σ):     19.922 s ±  0.109 s
+  Range (min … max):   19.765 s … 20.065 s
 ```
 
 `find` is much faster if it does not need to perform a regular-expression search:
 ```
-Benchmark #2: find ~ -iname '*[0-9].jpg'
-
-  Time (mean ± σ):      3.914 s ±  0.027 s
-
-  Range (min … max):    3.876 s …  3.964 s
+Benchmark 2: find ~ -iname '*[0-9].jpg'
+  Time (mean ± σ):     11.226 s ±  0.104 s
+  Range (min … max):   11.119 s … 11.466 s
 ```
 
-Now let's try the same for `fd`. Note that `fd` *always* performs a regular expression
-search. The options `--hidden` and `--no-ignore` are needed for a fair comparison,
-otherwise `fd` does not have to traverse hidden folders and ignored paths (see below):
+Now let's try the same for `fd`. Note that `fd` performs a regular expression
+search by default. The `-u`/`--unrestricted` option is needed here for
+a fair comparison. Otherwise `fd` does not have to traverse hidden folders and
+ignored paths (see below):
 ```
-Benchmark #3: fd -HI '.*[0-9]\.jpg$' ~
-
-  Time (mean ± σ):     811.6 ms ±  26.9 ms
-
-  Range (min … max):   786.0 ms … 870.7 ms
+Benchmark 3: fd -u '[0-9]\.jpg$' ~
+  Time (mean ± σ):     854.8 ms ±  10.0 ms
+  Range (min … max):   839.2 ms … 868.9 ms
 ```
-For this particular example, `fd` is approximately nine times faster than `find -iregex`
-and about five times faster than `find -iname`. By the way, both tools found the exact
-same 20880 files :smile:.
+For this particular example, `fd` is approximately **23 times faster** than `find -iregex`
+and about **13 times faster** than `find -iname`. By the way, both tools found the exact
+same 546 files :smile:.
 
-Finally, let's run `fd` without `--hidden` and `--no-ignore` (this can lead to different
-search results, of course). If *fd* does not have to traverse the hidden and git-ignored
-folders, it is almost an order of magnitude faster:
-```
-Benchmark #4: fd '[0-9]\.jpg$' ~
-
-  Time (mean ± σ):     123.7 ms ±   6.0 ms
-
-  Range (min … max):   118.8 ms … 140.0 ms
-```
-
-**Note**: This is *one particular* benchmark on *one particular* machine. While I have
-performed quite a lot of different tests (and found consistent results), things might
-be different for you! I encourage everyone to try it out on their own. See
+**Note**: This is *one particular* benchmark on *one particular* machine. While we have
+performed a lot of different tests (and found consistent results), things might
+be different for you! We encourage everyone to try it out on their own. See
 [this repository](https://github.com/sharkdp/fd-benchmarks) for all necessary scripts.
 
-Concerning *fd*'s speed, the main credit goes to the `regex` and `ignore` crates that are also used
-in [ripgrep](https://github.com/BurntSushi/ripgrep) (check it out!).
+Concerning *fd*'s speed, a lot of credit goes to the `regex` and `ignore` crates that are
+also used in [ripgrep](https://github.com/BurntSushi/ripgrep) (check it out!).
 
 ## Troubleshooting
 
+### `fd` does not find my file!
+
+Remember that `fd` ignores hidden directories and files by default. It also ignores patterns
+from `.gitignore` files. If you want to make sure to find absolutely every possible file, always
+use the `-u`/`--unrestricted` option (or `-HI` to enable hidden and ignored files):
+``` bash
+> fd -u …
+```
+
 ### Colorized output
 
 `fd` can colorize files by extension, just like `ls`. In order for this to work, the environment
@@ -402,15 +398,6 @@ for alternative, more complete (or more colorful) variants, see [here](https://g
 
 `fd` also honors the [`NO_COLOR`](https://no-color.org/) environment variable.
 
-### `fd` does not find my file!
-
-Remember that `fd` ignores hidden directories and files by default. It also ignores patterns
-from `.gitignore` files. If you want to make sure to find absolutely every possible file, always
-use the options `-H` and `-I` to disable these two features:
-``` bash
-> fd -HI …
-```
-
 ### `fd` doesn't seem to interpret my regex pattern correctly
 
 A lot of special regex characters (like `[]`, `^`, `$`, ..) are also special characters in your
@@ -543,7 +530,7 @@ Make sure that `$HOME/.local/bin` is in your `$PATH`.
 If you use an older version of Ubuntu, you can download the latest `.deb` package from the
 [release page](https://github.com/sharkdp/fd/releases) and install it via:
 ``` bash
-sudo dpkg -i fd_8.7.1_amd64.deb # adapt version number and architecture
+sudo dpkg -i fd_9.0.0_amd64.deb # adapt version number and architecture
 ```
 
 ### On Debian
@@ -677,7 +664,7 @@ With Rust's package manager [cargo](https://github.com/rust-lang/cargo), you can
 ```
 cargo install fd-find
 ```
-Note that rust version *1.64.0* or later is required.
+Note that rust version *1.70.0* or later is required.
 
 `make` is also needed for the build.
 
@@ -708,8 +695,6 @@ cargo install --path .
 
 ## License
 
-Copyright (c) 2017-2021 The fd developers
-
 `fd` is distributed under the terms of both the MIT License and the Apache License 2.0.
 
 See the [LICENSE-APACHE](LICENSE-APACHE) and [LICENSE-MIT](LICENSE-MIT) files for license details.
diff --git a/doc/release-checklist.md b/doc/release-checklist.md
index b11e75d..0a20802 100644
--- a/doc/release-checklist.md
+++ b/doc/release-checklist.md
@@ -9,7 +9,7 @@ necessary changes for the upcoming release.
 - [ ] Update version in `Cargo.toml`. Run `cargo build` to update `Cargo.lock`.
       Make sure to `git add` the `Cargo.lock` changes as well.
 - [ ] Find the current min. supported Rust version by running
-      `grep '^\s*MIN_SUPPORTED_RUST_VERSION' .github/workflows/CICD.yml`.
+      `grep rust-version Cargo.toml`.
 - [ ] Update the `fd` version and the min. supported Rust version in `README.md`.
 - [ ] Update `CHANGELOG.md`. Change the heading of the *"Upcoming release"* section
       to the version of this release.
diff --git a/src/cli.rs b/src/cli.rs
index c17f0b8..64ea111 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -715,24 +715,14 @@ impl Opts {
 fn default_num_threads() -> NonZeroUsize {
     // If we can't get the amount of parallelism for some reason, then
     // default to a single thread, because that is safe.
-    // Note that the minimum value for a NonZeroUsize is 1.
-    // Unfortunately, we can't do `NonZeroUsize::new(1).unwrap()`
-    // in a const context.
-    const FALLBACK_PARALLELISM: NonZeroUsize = NonZeroUsize::MIN;
-    // As the number of threads increases, the startup time suffers from
-    // initializing the threads, and we get diminishing returns from additional
-    // parallelism. So set a maximum number of threads to use by default.
-    //
-    // This value is based on some empirical observations, but the ideal value
-    // probably depends on the exact hardware in use.
-    //
-    // Safety: The literal "20" is known not to be zero.
-    const MAX_DEFAULT_THREADS: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(20) };
+    let fallback = NonZeroUsize::MIN;
+    // To limit startup overhead on massively parallel machines, don't use more
+    // than 64 threads.
+    let limit = NonZeroUsize::new(64).unwrap();
 
-    std::cmp::min(
-        std::thread::available_parallelism().unwrap_or(FALLBACK_PARALLELISM),
-        MAX_DEFAULT_THREADS,
-    )
+    std::thread::available_parallelism()
+        .unwrap_or(fallback)
+        .min(limit)
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, ValueEnum)]
@@ -768,17 +758,6 @@ pub enum ColorWhen {
     Never,
 }
 
-impl ColorWhen {
-    pub fn as_str(&self) -> &'static str {
-        use ColorWhen::*;
-        match *self {
-            Auto => "auto",
-            Never => "never",
-            Always => "always",
-        }
-    }
-}
-
 // there isn't a derive api for getting grouped values yet,
 // so we have to use hand-rolled parsing for exec and exec-batch
 pub struct Exec {
diff --git a/src/dir_entry.rs b/src/dir_entry.rs
index 3a19d59..f44f2be 100644
--- a/src/dir_entry.rs
+++ b/src/dir_entry.rs
@@ -8,11 +8,13 @@ use lscolors::{Colorable, LsColors, Style};
 use crate::config::Config;
 use crate::filesystem::strip_current_dir;
 
+#[derive(Debug)]
 enum DirEntryInner {
     Normal(ignore::DirEntry),
     BrokenSymlink(PathBuf),
 }
 
+#[derive(Debug)]
 pub struct DirEntry {
     inner: DirEntryInner,
     metadata: OnceCell<Option<Metadata>>,
diff --git a/src/exec/job.rs b/src/exec/job.rs
index af603cc..4864d6d 100644
--- a/src/exec/job.rs
+++ b/src/exec/job.rs
@@ -1,9 +1,6 @@
 use std::sync::Mutex;
 
-use crossbeam_channel::Receiver;
-
 use crate::config::Config;
-use crate::dir_entry::DirEntry;
 use crate::error::print_error;
 use crate::exit_codes::{merge_exitcodes, ExitCode};
 use crate::walk::WorkerResult;
@@ -14,7 +11,7 @@ use super::CommandSet;
 /// generate a command with the supplied command template. The generated command will then
 /// be executed, and this process will continue until the receiver's sender has closed.
 pub fn job(
-    rx: Receiver<WorkerResult>,
+    results: impl IntoIterator<Item = WorkerResult>,
     cmd: &CommandSet,
     out_perm: &Mutex<()>,
     config: &Config,
@@ -22,35 +19,39 @@ pub fn job(
     // Output should be buffered when only running a single thread
     let buffer_output: bool = config.threads > 1;
 
-    let mut results: Vec<ExitCode> = Vec::new();
-    loop {
+    let mut ret = ExitCode::Success;
+    for result in results {
         // Obtain the next result from the receiver, else if the channel
         // has closed, exit from the loop
-        let dir_entry: DirEntry = match rx.recv() {
-            Ok(WorkerResult::Entry(dir_entry)) => dir_entry,
-            Ok(WorkerResult::Error(err)) => {
+        let dir_entry = match result {
+            WorkerResult::Entry(dir_entry) => dir_entry,
+            WorkerResult::Error(err) => {
                 if config.show_filesystem_errors {
                     print_error(err.to_string());
                 }
                 continue;
             }
-            Err(_) => break,
         };
 
         // Generate a command, execute it and store its exit code.
-        results.push(cmd.execute(
+        let code = cmd.execute(
             dir_entry.stripped_path(config),
             config.path_separator.as_deref(),
             out_perm,
             buffer_output,
-        ))
+        );
+        ret = merge_exitcodes([ret, code]);
     }
     // Returns error in case of any error.
-    merge_exitcodes(results)
+    ret
 }
 
-pub fn batch(rx: Receiver<WorkerResult>, cmd: &CommandSet, config: &Config) -> ExitCode {
-    let paths = rx
+pub fn batch(
+    results: impl IntoIterator<Item = WorkerResult>,
+    cmd: &CommandSet,
+    config: &Config,
+) -> ExitCode {
+    let paths = results
         .into_iter()
         .filter_map(|worker_result| match worker_result {
             WorkerResult::Entry(dir_entry) => Some(dir_entry.into_stripped_path(config)),
diff --git a/src/main.rs b/src/main.rs
index 5440601..bef4120 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -325,18 +325,22 @@ fn extract_command(opts: &mut Opts, colored_output: bool) -> Result<Option<Comma
             if !opts.list_details {
                 return None;
             }
-            let color_arg = format!("--color={}", opts.color.as_str());
 
-            let res = determine_ls_command(&color_arg, colored_output)
+            let res = determine_ls_command(colored_output)
                 .map(|cmd| CommandSet::new_batch([cmd]).unwrap());
             Some(res)
         })
         .transpose()
 }
 
-fn determine_ls_command(color_arg: &str, colored_output: bool) -> Result<Vec<&str>> {
+fn determine_ls_command(colored_output: bool) -> Result<Vec<&'static str>> {
     #[allow(unused)]
     let gnu_ls = |command_name| {
+        let color_arg = if colored_output {
+            "--color=always"
+        } else {
+            "--color=never"
+        };
         // Note: we use short options here (instead of --long-options) to support more
         // platforms (like BusyBox).
         vec![
diff --git a/src/walk.rs b/src/walk.rs
index 691c5d0..c81d2a4 100644
--- a/src/walk.rs
+++ b/src/walk.rs
@@ -4,12 +4,12 @@ use std::io::{self, Write};
 use std::mem;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Mutex, MutexGuard};
 use std::thread;
 use std::time::{Duration, Instant};
 
 use anyhow::{anyhow, Result};
-use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, Sender};
+use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, SendError, Sender};
 use etcetera::BaseStrategy;
 use ignore::overrides::{Override, OverrideBuilder};
 use ignore::{self, WalkBuilder, WalkParallel, WalkState};
@@ -36,6 +36,7 @@ enum ReceiverMode {
 
 /// The Worker threads can result in a valid entry having PathBuf or an error.
 #[allow(clippy::large_enum_variant)]
+#[derive(Debug)]
 pub enum WorkerResult {
     // Errors should be rare, so it's probably better to allow large_enum_variant than
     // to box the Entry variant
@@ -43,6 +44,83 @@ pub enum WorkerResult {
     Error(ignore::Error),
 }
 
+/// A batch of WorkerResults to send over a channel.
+#[derive(Clone)]
+struct Batch {
+    items: Arc<Mutex<Option<Vec<WorkerResult>>>>,
+}
+
+impl Batch {
+    fn new() -> Self {
+        Self {
+            items: Arc::new(Mutex::new(Some(vec![]))),
+        }
+    }
+
+    fn lock(&self) -> MutexGuard<'_, Option<Vec<WorkerResult>>> {
+        self.items.lock().unwrap()
+    }
+}
+
+impl IntoIterator for Batch {
+    type Item = WorkerResult;
+    type IntoIter = std::vec::IntoIter<WorkerResult>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.lock().take().unwrap().into_iter()
+    }
+}
+
+/// Wrapper that sends batches of items at once over a channel.
+struct BatchSender {
+    batch: Batch,
+    tx: Sender<Batch>,
+    limit: usize,
+}
+
+impl BatchSender {
+    fn new(tx: Sender<Batch>, limit: usize) -> Self {
+        Self {
+            batch: Batch::new(),
+            tx,
+            limit,
+        }
+    }
+
+    /// Check if we need to flush a batch.
+    fn needs_flush(&self, batch: Option<&Vec<WorkerResult>>) -> bool {
+        match batch {
+            // Limit the batch size to provide some backpressure
+            Some(vec) => vec.len() >= self.limit,
+            // Batch was already taken by the receiver, so make a new one
+            None => true,
+        }
+    }
+
+    /// Add an item to a batch.
+    fn send(&mut self, item: WorkerResult) -> Result<(), SendError<()>> {
+        let mut batch = self.batch.lock();
+
+        if self.needs_flush(batch.as_ref()) {
+            drop(batch);
+            self.batch = Batch::new();
+            batch = self.batch.lock();
+        }
+
+        let items = batch.as_mut().unwrap();
+        items.push(item);
+
+        if items.len() == 1 {
+            // New batch, send it over the channel
+            self.tx
+                .send(self.batch.clone())
+                .map_err(|_| SendError(()))?;
+        }
+
+        Ok(())
+    }
+}
+
 /// Maximum size of the output buffer before flushing results to the console
 const MAX_BUFFER_LENGTH: usize = 1000;
 /// Default duration until output buffering switches to streaming.
@@ -57,7 +135,7 @@ struct ReceiverBuffer<'a, W> {
     /// The ^C notifier.
     interrupt_flag: &'a AtomicBool,
     /// Receiver for worker results.
-    rx: Receiver<WorkerResult>,
+    rx: Receiver<Batch>,
     /// Standard output.
     stdout: W,
     /// The current buffer mode.
@@ -72,7 +150,7 @@ struct ReceiverBuffer<'a, W> {
 
 impl<'a, W: Write> ReceiverBuffer<'a, W> {
     /// Create a new receiver buffer.
-    fn new(state: &'a WorkerState, rx: Receiver<WorkerResult>, stdout: W) -> Self {
+    fn new(state: &'a WorkerState, rx: Receiver<Batch>, stdout: W) -> Self {
         let config = &state.config;
         let quit_flag = state.quit_flag.as_ref();
         let interrupt_flag = state.interrupt_flag.as_ref();
@@ -103,7 +181,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> {
     }
 
     /// Receive the next worker result.
-    fn recv(&self) -> Result<WorkerResult, RecvTimeoutError> {
+    fn recv(&self) -> Result<Batch, RecvTimeoutError> {
         match self.mode {
             ReceiverMode::Buffering => {
                 // Wait at most until we should switch to streaming
@@ -119,34 +197,44 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> {
     /// Wait for a result or state change.
     fn poll(&mut self) -> Result<(), ExitCode> {
         match self.recv() {
-            Ok(WorkerResult::Entry(dir_entry)) => {
-                if self.config.quiet {
-                    return Err(ExitCode::HasResults(true));
-                }
+            Ok(batch) => {
+                for result in batch {
+                    match result {
+                        WorkerResult::Entry(dir_entry) => {
+                            if self.config.quiet {
+                                return Err(ExitCode::HasResults(true));
+                            }
 
-                match self.mode {
-                    ReceiverMode::Buffering => {
-                        self.buffer.push(dir_entry);
-                        if self.buffer.len() > MAX_BUFFER_LENGTH {
-                            self.stream()?;
+                            match self.mode {
+                                ReceiverMode::Buffering => {
+                                    self.buffer.push(dir_entry);
+                                    if self.buffer.len() > MAX_BUFFER_LENGTH {
+                                        self.stream()?;
+                                    }
+                                }
+                                ReceiverMode::Streaming => {
+                                    self.print(&dir_entry)?;
+                                }
+                            }
+
+                            self.num_results += 1;
+                            if let Some(max_results) = self.config.max_results {
+                                if self.num_results >= max_results {
+                                    return self.stop();
+                                }
+                            }
+                        }
+                        WorkerResult::Error(err) => {
+                            if self.config.show_filesystem_errors {
+                                print_error(err.to_string());
+                            }
                         }
                     }
-                    ReceiverMode::Streaming => {
-                        self.print(&dir_entry)?;
-                        self.flush()?;
-                    }
                 }
 
-                self.num_results += 1;
-                if let Some(max_results) = self.config.max_results {
-                    if self.num_results >= max_results {
-                        return self.stop();
-                    }
-                }
-            }
-            Ok(WorkerResult::Error(err)) => {
-                if self.config.show_filesystem_errors {
-                    print_error(err.to_string());
+                // If we don't have another batch ready, flush before waiting
+                if self.mode == ReceiverMode::Streaming && self.rx.is_empty() {
+                    self.flush()?;
                 }
             }
             Err(RecvTimeoutError::Timeout) => {
@@ -201,7 +289,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> {
 
     /// Flush stdout if necessary.
     fn flush(&mut self) -> Result<(), ExitCode> {
-        if self.config.interactive_terminal && self.stdout.flush().is_err() {
+        if self.stdout.flush().is_err() {
             // Probably a broken pipe. Exit gracefully.
             return Err(ExitCode::GeneralError);
         }
@@ -319,13 +407,13 @@ impl WorkerState {
 
     /// Run the receiver work, either on this thread or a pool of background
     /// threads (for --exec).
-    fn receive(&self, rx: Receiver<WorkerResult>) -> ExitCode {
+    fn receive(&self, rx: Receiver<Batch>) -> ExitCode {
         let config = &self.config;
 
         // This will be set to `Some` if the `--exec` argument was supplied.
         if let Some(ref cmd) = config.command {
             if cmd.in_batch_mode() {
-                exec::batch(rx, cmd, &config)
+                exec::batch(rx.into_iter().flatten(), cmd, &config)
             } else {
                 let out_perm = Mutex::new(());
 
@@ -337,7 +425,8 @@ impl WorkerState {
                         let rx = rx.clone();
 
                         // Spawn a job thread that will listen for and execute inputs.
-                        let handle = scope.spawn(|| exec::job(rx, cmd, &out_perm, &config));
+                        let handle = scope
+                            .spawn(|| exec::job(rx.into_iter().flatten(), cmd, &out_perm, &config));
 
                         // Push the handle of the spawned thread into the vector for later joining.
                         handles.push(handle);
@@ -355,12 +444,20 @@ impl WorkerState {
     }
 
     /// Spawn the sender threads.
-    fn spawn_senders(&self, walker: WalkParallel, tx: Sender<WorkerResult>) {
+    fn spawn_senders(&self, walker: WalkParallel, tx: Sender<Batch>) {
         walker.run(|| {
             let patterns = &self.patterns;
             let config = &self.config;
             let quit_flag = self.quit_flag.as_ref();
-            let tx = tx.clone();
+
+            let mut limit = 0x100;
+            if let Some(cmd) = &config.command {
+                if !cmd.in_batch_mode() && config.threads > 1 {
+                    // Evenly distribute work between multiple receivers
+                    limit = 1;
+                }
+            }
+            let mut tx = BatchSender::new(tx.clone(), limit);
 
             Box::new(move |entry| {
                 if quit_flag.load(Ordering::Relaxed) {
@@ -545,8 +642,7 @@ impl WorkerState {
             .unwrap();
         }
 
-        // Channel capacity was chosen empircally to perform similarly to an unbounded channel
-        let (tx, rx) = bounded(0x4000 * config.threads);
+        let (tx, rx) = bounded(2 * config.threads);
 
         let exit_code = thread::scope(|scope| {
             // Spawn the receiver thread(s)