diff --git a/CHANGELOG.md b/CHANGELOG.md index cd148a62..e66bfddb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,15 @@ - Python syntax highlighting no longer suffers from abysmal performance in specific scenarios. See #1688 (@keith-hall) +## Performance + +- Load cached assets as fast as integrated assets, see #1753 (@Enselic) +- Greatly reduce startup time in loop-through mode, e.g. when redirecting output. Instead of *50 ms* - *100 ms*, startup takes *5 ms* - *10 ms*. See #1747 (@Enselic) +- Reduce startup time by approximately 80% for 91 out of 168 syntaxes when using `--language`. See #1787 (@Enselic) + ## Other - Add PowerShell completion, see #1826 (@rashil2000) -- Load cached assets as fast as integrated assets, see #1753 (@Enselic) -- Greatly reduce startup time in loop-through mode, e.g. when redirecting output. Instead of *50 ms* - *100 ms*, startup takes *5 ms* - *10 ms*. See #1747 (@Enselic) - ## Syntaxes diff --git a/assets/minimal_syntaxes.bin b/assets/minimal_syntaxes.bin new file mode 100644 index 00000000..96be60c8 Binary files /dev/null and b/assets/minimal_syntaxes.bin differ diff --git a/src/assets.rs b/src/assets.rs index 26de12e7..e315a402 100644 --- a/src/assets.rs +++ b/src/assets.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::ffi::OsStr; use std::fs; use std::path::{Path, PathBuf}; @@ -18,6 +19,14 @@ use crate::syntax_mapping::{MappingTarget, SyntaxMapping}; pub struct HighlightingAssets { syntax_set_cell: LazyCell, serialized_syntax_set: SerializedSyntaxSet, + + minimal_syntaxes: MinimalSyntaxes, + + /// Lazily load serialized [SyntaxSet]s from [Self.minimal_syntaxes]. The + /// index in this vec matches the index in + /// [Self.minimal_syntaxes.serialized_syntax_sets] + deserialized_minimal_syntaxes: Vec>, + theme_set: ThemeSet, fallback_theme: Option<&'static str>, } @@ -28,12 +37,39 @@ pub struct SyntaxReferenceInSet<'a> { pub syntax_set: &'a SyntaxSet, } +/// Stores and allows lookup of minimal [SyntaxSet]s. 
The [SyntaxSet]s are +/// stored in serialized form, and are deserialized on-demand. This gives good +/// startup performance since only the necessary [SyntaxReference]s need to be +/// deserialized. +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub(crate) struct MinimalSyntaxes { + /// Lookup the index into `serialized_syntax_sets` of a [SyntaxSet] by the + /// name of any [SyntaxReference] inside the [SyntaxSet] + /// (We will later add `by_extension`, `by_first_line`, etc.) + pub(crate) by_name: HashMap, + + /// Serialized [SyntaxSet]s. Whether or not this data is compressed is + /// decided by [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] + pub(crate) serialized_syntax_sets: Vec>, +} + // Compress for size of ~700 kB instead of ~4600 kB at the cost of ~30% longer deserialization time pub(crate) const COMPRESS_SYNTAXES: bool = true; // Compress for size of ~20 kB instead of ~200 kB at the cost of ~30% longer deserialization time pub(crate) const COMPRESS_THEMES: bool = true; +// Compress for size of ~400 kB instead of ~2100 kB at the cost of ~30% longer deserialization time +pub(crate) const COMPRESS_SERIALIZED_MINIMAL_SYNTAXES: bool = true; + +// Whether or not to compress the serialized form of [MinimalSyntaxes]. Shall +// always be `false`, because the data in +// [MinimalSyntaxes.serialized_syntax_sets] has already been compressed +// (assuming [COMPRESS_SERIALIZED_MINIMAL_SYNTAXES] is `true`). The "outer" data +// structures like `by_name` are tiny. If we compress, deserialization can't do +// efficient byte-by-byte copy of `serialized_syntax_sets`. 
+pub(crate) const COMPRESS_MINIMAL_SYNTAXES: bool = false; + const IGNORED_SUFFIXES: [&str; 13] = [ // Editor etc backups "~", @@ -55,10 +91,20 @@ const IGNORED_SUFFIXES: [&str; 13] = [ ]; impl HighlightingAssets { - fn new(serialized_syntax_set: SerializedSyntaxSet, theme_set: ThemeSet) -> Self { + fn new( + serialized_syntax_set: SerializedSyntaxSet, + minimal_syntaxes: MinimalSyntaxes, + theme_set: ThemeSet, + ) -> Self { + // Prepare so we can lazily load minimal syntaxes without a mut reference + let deserialized_minimal_syntaxes = + vec![LazyCell::new(); minimal_syntaxes.serialized_syntax_sets.len()]; + HighlightingAssets { syntax_set_cell: LazyCell::new(), serialized_syntax_set, + deserialized_minimal_syntaxes, + minimal_syntaxes, theme_set, fallback_theme: None, } @@ -71,6 +117,11 @@ impl HighlightingAssets { pub fn from_cache(cache_path: &Path) -> Result { Ok(HighlightingAssets::new( SerializedSyntaxSet::FromFile(cache_path.join("syntaxes.bin")), + asset_from_cache( + &cache_path.join("minimal_syntaxes.bin"), + "minimal syntax sets", + COMPRESS_MINIMAL_SYNTAXES, + )?, asset_from_cache(&cache_path.join("themes.bin"), "theme set", COMPRESS_THEMES)?, )) } @@ -78,6 +129,7 @@ impl HighlightingAssets { pub fn from_binary() -> Self { HighlightingAssets::new( SerializedSyntaxSet::FromBinary(get_serialized_integrated_syntaxset()), + get_integrated_minimal_syntaxes(), get_integrated_themeset(), ) } @@ -111,6 +163,41 @@ impl HighlightingAssets { self.get_theme_set().themes.keys().map(|s| s.as_ref()) } + /// Finds a [SyntaxSet] that contains a [SyntaxReference] by its name. First + /// tries to find a minimal [SyntaxSet]. If none is found, returns the + /// [SyntaxSet] that contains all syntaxes. 
+ fn get_syntax_set_by_name(&self, name: &str) -> Result<&SyntaxSet> { + let minimal_syntax_set = self + .minimal_syntaxes + .by_name + .get(&name.to_ascii_lowercase()) + .and_then(|index| self.get_minimal_syntax_set_with_index(*index)); + + match minimal_syntax_set { + Some(syntax_set) => Ok(syntax_set), + None => self.get_syntax_set(), + } + } + + fn load_minimal_syntax_set_with_index(&self, index: usize) -> Result { + let serialized_syntax_set = &self.minimal_syntaxes.serialized_syntax_sets[index]; + asset_from_contents( + &serialized_syntax_set[..], + &format!("minimal syntax set {}", index), + COMPRESS_SERIALIZED_MINIMAL_SYNTAXES, + ) + .map_err(|_| format!("Could not parse minimal syntax set {}", index).into()) + } + + fn get_minimal_syntax_set_with_index(&self, index: usize) -> Option<&SyntaxSet> { + self.deserialized_minimal_syntaxes + .get(index) + .and_then(|cell| { + cell.try_borrow_with(|| self.load_minimal_syntax_set_with_index(index)) + .ok() + }) + } + /// Use [Self::get_syntax_for_file_name] instead #[deprecated] pub fn syntax_for_file_name( @@ -167,7 +254,7 @@ impl HighlightingAssets { mapping: &SyntaxMapping, ) -> Result { if let Some(language) = language { - let syntax_set = self.get_syntax_set()?; + let syntax_set = self.get_syntax_set_by_name(language)?; syntax_set .find_syntax_by_token(language) .map(|syntax| SyntaxReferenceInSet { syntax, syntax_set }) @@ -320,6 +407,13 @@ pub(crate) fn get_integrated_themeset() -> ThemeSet { from_binary(include_bytes!("../assets/themes.bin"), COMPRESS_THEMES) } +fn get_integrated_minimal_syntaxes() -> MinimalSyntaxes { + from_binary( + include_bytes!("../assets/minimal_syntaxes.bin"), + COMPRESS_MINIMAL_SYNTAXES, + ) +} + pub(crate) fn from_binary(v: &[u8], compressed: bool) -> T { asset_from_contents(v, "n/a", compressed) .expect("data integrated in binary is never faulty, but make sure `compressed` is in sync!") diff --git a/src/bin/bat/assets.rs b/src/bin/bat/assets.rs index 5e75492f..b775a4d9 100644 --- 
a/src/bin/bat/assets.rs +++ b/src/bin/bat/assets.rs @@ -20,6 +20,7 @@ pub fn cache_dir() -> Cow<'static, str> { pub fn clear_assets() { clear_asset("themes.bin", "theme set cache"); clear_asset("syntaxes.bin", "syntax set cache"); + clear_asset("minimal_syntaxes.bin", "minimal syntax sets cache"); clear_asset("metadata.yaml", "metadata file"); } diff --git a/src/build_assets.rs b/src/build_assets.rs index 43687d5e..75e6d5bc 100644 --- a/src/build_assets.rs +++ b/src/build_assets.rs @@ -37,17 +37,19 @@ pub fn build_assets( let syntax_set_builder = build_syntax_set_builder(source_dir, include_integrated_assets)?; - if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() { - // To trigger this code, run: - // BAT_PRINT_SYNTAX_DEPENDENCIES=1 cargo run -- cache --build --source assets --blank --target /tmp - print_syntax_dependencies(&syntax_set_builder); - } + let minimal_syntaxes = build_minimal_syntaxes(&syntax_set_builder, include_integrated_assets)?; let syntax_set = syntax_set_builder.build(); print_unlinked_contexts(&syntax_set); - write_assets(&theme_set, &syntax_set, target_dir, current_version) + write_assets( + &theme_set, + &syntax_set, + &minimal_syntaxes, + target_dir, + current_version, + ) } fn build_theme_set(source_dir: &Path, include_integrated_assets: bool) -> ThemeSet { @@ -116,6 +118,7 @@ fn print_unlinked_contexts(syntax_set: &SyntaxSet) { fn write_assets( theme_set: &ThemeSet, syntax_set: &SyntaxSet, + minimal_syntaxes: &MinimalSyntaxes, target_dir: &Path, current_version: &str, ) -> Result<()> { @@ -132,6 +135,12 @@ fn write_assets( "syntax set", COMPRESS_SYNTAXES, )?; + asset_to_cache( + minimal_syntaxes, + &target_dir.join("minimal_syntaxes.bin"), + "minimal syntax sets", + COMPRESS_MINIMAL_SYNTAXES, + )?; print!( "Writing metadata to folder {} ... ", @@ -143,26 +152,65 @@ fn write_assets( Ok(()) } -/// Generates independent [SyntaxSet]s after analyzing dependencies between syntaxes -/// in a [SyntaxSetBuilder], and then prints the reults. 
-fn print_syntax_dependencies(syntax_set_builder: &SyntaxSetBuilder) { - println!("Constructing independent SyntaxSets..."); - let independent_syntax_sets = build_independent_syntax_sets(syntax_set_builder); +fn print_syntax_set_names(syntax_set: &SyntaxSet) { + let names = syntax_set + .syntaxes() + .iter() + .map(|syntax| &syntax.name) + .collect::>(); + println!("{:?}", names); +} - println!("Independent SyntaxSets:"); - for syntax_set in independent_syntax_sets { - let names = syntax_set - .syntaxes() - .iter() - .map(|syntax| &syntax.name) - .collect::>(); - println!("{:?}", names); +fn build_minimal_syntaxes( + syntax_set_builder: &'_ SyntaxSetBuilder, + include_integrated_assets: bool, +) -> Result { + let mut minimal_syntaxes = MinimalSyntaxes { + by_name: HashMap::new(), + serialized_syntax_sets: vec![], + }; + + if include_integrated_assets { + // Dependency info is not present in integrated assets, so we can't + // calculate minimal syntax sets. Return early without any data filled + // in. This means that no minimal syntax sets will be available to use, and + // the full, slow-to-deserialize, fallback syntax set will be used instead. 
+ return Ok(minimal_syntaxes); } + + let minimal_syntax_sets_to_serialize = build_minimal_syntax_sets(syntax_set_builder) + // For now, only store syntax sets with one syntax, otherwise + // the binary grows by several megs + .filter(|syntax_set| syntax_set.syntaxes().len() == 1); + + for minimal_syntax_set in minimal_syntax_sets_to_serialize { + // Remember what index it is found at + let current_index = minimal_syntaxes.serialized_syntax_sets.len(); + + for syntax in minimal_syntax_set.syntaxes() { + minimal_syntaxes + .by_name + .insert(syntax.name.to_ascii_lowercase().clone(), current_index); + } + + let serialized_syntax_set = asset_to_contents( + &minimal_syntax_set, + &format!("failed to serialize minimal syntax set {}", current_index), + COMPRESS_SERIALIZED_MINIMAL_SYNTAXES, + )?; + + // Add last so that it ends up at `current_index` + minimal_syntaxes + .serialized_syntax_sets + .push(serialized_syntax_set); + } + + Ok(minimal_syntaxes) } /// Analyzes dependencies between syntaxes in a [SyntaxSetBuilder]. -/// From that, it builds independent [SyntaxSet]s. -fn build_independent_syntax_sets( +/// From that, it builds minimal [SyntaxSet]s. 
+fn build_minimal_syntax_sets( syntax_set_builder: &'_ SyntaxSetBuilder, ) -> impl Iterator + '_ { let syntaxes = syntax_set_builder.syntaxes(); @@ -170,7 +218,7 @@ fn build_independent_syntax_sets( // Build the data structures we need for dependency resolution let (syntax_to_dependencies, dependency_to_syntax) = generate_maps(syntaxes); - // Create one independent SyntaxSet from each (non-hidden) SyntaxDefinition + // Create one minimal SyntaxSet from each (non-hidden) SyntaxDefinition syntaxes.iter().filter_map(move |syntax| { if syntax.hidden { return None; @@ -178,7 +226,15 @@ fn build_independent_syntax_sets( let mut builder = SyntaxSetDependencyBuilder::new(); builder.add_with_dependencies(syntax, &syntax_to_dependencies, &dependency_to_syntax); - Some(builder.build()) + let syntax_set = builder.build(); + + if std::env::var("BAT_PRINT_SYNTAX_DEPENDENCIES").is_ok() { + // To trigger this code, run: + // BAT_PRINT_SYNTAX_DEPENDENCIES=1 cargo run -- cache --build --source assets --blank --target /tmp + print_syntax_set_names(&syntax_set); + } + + Some(syntax_set) }) }