Merge pull request #2755 from cyqsimon/syntax-mapping-refactor

More extensible syntax mapping mechanism
This commit is contained in:
David Peter 2024-01-21 19:43:24 +01:00 committed by GitHub
commit db66e4459b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 932 additions and 234 deletions

View File

@ -15,6 +15,7 @@
- Minor benchmark script improvements #2768 (@cyqsimon)
- Update Arch Linux package URL in README files #2779 (@brunobell)
- Update and improve `zsh` completion, see #2772 (@okapia)
- More extensible syntax mapping mechanism #2755 (@cyqsimon)
- Use proper Architecture for Debian packages built for musl, see #2811 (@Enselic)
- Pull in fix for unsafe-libyaml security advisory, see #2812 (@dtolnay)
- Update git-version dependency to use Syn v2, see #2816 (@dtolnay)
@ -28,6 +29,10 @@
## `bat` as a library
- Changes to `syntax_mapping::SyntaxMapping` #2755 (@cyqsimon)
- `SyntaxMapping::get_syntax_for` is now correctly public
- [BREAKING] `SyntaxMapping::{empty,builtin}` are removed; use `SyntaxMapping::new` instead
- [BREAKING] `SyntaxMapping::mappings` is replaced by `SyntaxMapping::{builtin,custom,all}_mappings`
- Make `Controller::run_with_error_handler`'s error handler `FnMut`, see #2831 (@rhysd)
# v0.24.0

132
Cargo.lock generated
View File

@ -129,6 +129,8 @@ dependencies = [
"globset",
"grep-cli",
"home",
"indexmap 2.1.0",
"itertools",
"nix",
"nu-ansi-term",
"once_cell",
@ -140,12 +142,14 @@ dependencies = [
"run_script",
"semver",
"serde",
"serde_with",
"serde_yaml",
"serial_test",
"shell-words",
"syntect",
"tempfile",
"thiserror",
"toml",
"unicode-width",
"wait-timeout",
"walkdir",
@ -224,11 +228,12 @@ checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc"
[[package]]
name = "cc"
version = "1.0.73"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
dependencies = [
"jobserver",
"libc",
]
[[package]]
@ -314,6 +319,41 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "darling"
version = "0.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e"
dependencies = [
"darling_core",
"darling_macro",
]
[[package]]
name = "darling_core"
version = "0.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "177e3443818124b357d8e76f53be906d60937f0d3a90773a664fa63fa253e621"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim",
"syn",
]
[[package]]
name = "darling_macro"
version = "0.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5"
dependencies = [
"darling_core",
"quote",
"syn",
]
[[package]]
name = "dashmap"
version = "5.4.0"
@ -578,6 +618,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.3.0"
@ -600,12 +646,13 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.0.2"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897"
checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
dependencies = [
"equivalent",
"hashbrown 0.14.1",
"serde",
]
[[package]]
@ -1101,13 +1148,44 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_spanned"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12022b835073e5b11e90a14f86838ceb1c8fb0325b72416845c487ac0fa95e80"
dependencies = [
"serde",
]
[[package]]
name = "serde_with"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64cd236ccc1b7a29e7e2739f27c0b2dd199804abc4290e32f59f3b68d6405c23"
dependencies = [
"serde",
"serde_with_macros",
]
[[package]]
name = "serde_with_macros"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93634eb5f75a2323b16de4748022ac4297f9e76b6dced2be287a099f41b5e788"
dependencies = [
"darling",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_yaml"
version = "0.9.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a15e0ef66bf939a7c890a0bf6d5a733c70202225f9888a89ed5c62298b019129"
dependencies = [
"indexmap 2.0.2",
"indexmap 2.1.0",
"itoa",
"ryu",
"serde",
@ -1294,6 +1372,41 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "toml"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ff9e3abce27ee2c9a37f9ad37238c1bdd4e789c84ba37df76aa4d528f5072cc"
dependencies = [
"indexmap 2.1.0",
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.20.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70f427fce4d84c72b5b732388bf4a9f4531b53f74e2887e3ecb2481f68f66d81"
dependencies = [
"indexmap 2.1.0",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "unicode-bidi"
version = "0.3.8"
@ -1613,6 +1726,15 @@ version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
[[package]]
name = "winnow"
version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "176b6138793677221d420fd2f0aeeced263f197688b36484660da767bca2fa32"
dependencies = [
"memchr",
]
[[package]]
name = "yaml-rust"
version = "0.4.5"

View File

@ -100,6 +100,14 @@ nix = { version = "0.26.4", default-features = false, features = ["term"] }
[build-dependencies]
anyhow = "1.0.78"
indexmap = { version = "2.1.0", features = ["serde"] }
itertools = "0.11.0"
once_cell = "1.18"
regex = "1.10.2"
serde = { version = "1.0", features = ["derive"] }
serde_with = { version = "3.4.0", default-features = false, features = ["macros"] }
toml = { version = "0.8.6", features = ["preserve_order"] }
walkdir = "2.4"
[build-dependencies.clap]
version = "4.4.12"

View File

@ -1,5 +1,6 @@
#[cfg(feature = "application")]
mod application;
mod syntax_mapping;
mod util;
fn main() -> anyhow::Result<()> {
@ -7,6 +8,8 @@ fn main() -> anyhow::Result<()> {
// see: https://doc.rust-lang.org/cargo/reference/build-scripts.html#rerun-if-changed
println!("cargo:rerun-if-changed=build/");
syntax_mapping::build_static_mappings()?;
#[cfg(feature = "application")]
application::gen_man_and_comp()?;

292
build/syntax_mapping.rs Normal file
View File

@ -0,0 +1,292 @@
use std::{
convert::Infallible,
env, fs,
path::{Path, PathBuf},
str::FromStr,
};
use anyhow::{anyhow, bail};
use indexmap::IndexMap;
use itertools::Itertools;
use once_cell::sync::Lazy;
use regex::Regex;
use serde::Deserialize;
use serde_with::DeserializeFromStr;
use walkdir::WalkDir;
/// The mapping targets a rule definition may name.
///
/// Build-script counterpart of `syntax_mapping::MappingTarget`; it is parsed
/// from the keys of the definition files via its `FromStr` implementation.
#[derive(Clone, Debug, PartialEq, Eq, Hash, DeserializeFromStr)]
#[allow(clippy::enum_variant_names)]
pub enum MappingTarget {
    /// Map to the syntax with the given name.
    MapTo(String),
    /// Emitted verbatim as `MappingTarget::MapToUnknown`.
    MapToUnknown,
    /// Emitted verbatim as `MappingTarget::MapExtensionToUnknown`.
    MapExtensionToUnknown,
}
impl FromStr for MappingTarget {
    type Err = Infallible;

    /// Parse a mapping target from its textual form.
    ///
    /// The two special spellings `MappingTarget::MapToUnknown` and
    /// `MappingTarget::MapExtensionToUnknown` select the corresponding
    /// variants; any other string is taken as a syntax name. Parsing never
    /// fails.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let target = if s == "MappingTarget::MapToUnknown" {
            Self::MapToUnknown
        } else if s == "MappingTarget::MapExtensionToUnknown" {
            Self::MapExtensionToUnknown
        } else {
            Self::MapTo(s.to_owned())
        };
        Ok(target)
    }
}
impl MappingTarget {
    /// Render this target as the Rust expression that is written into the
    /// generated source file.
    fn codegen(&self) -> String {
        if let Self::MapTo(syntax) = self {
            // the syntax name is embedded as a raw string literal
            return format!(r###"MappingTarget::MapTo(r#"{syntax}"#)"###);
        }

        let unit_variant = match self {
            Self::MapExtensionToUnknown => "MappingTarget::MapExtensionToUnknown",
            _ => "MappingTarget::MapToUnknown",
        };
        String::from(unit_variant)
    }
}
/// A single matcher, stored as an ordered sequence of segments.
///
/// Codegen converts this into a `Lazy<Option<GlobMatcher>>`.
#[derive(Clone, Debug, Eq, PartialEq, Hash, DeserializeFromStr)]
struct Matcher(Vec<MatcherSegment>);
/// Parse a matcher.
///
/// The parser is deliberately strict. Every well-formed `${VAR}` occurrence is
/// consumed as an environment variable replacement; after that, any '$', '{',
/// or '}' still present in the plain-text segments is a hard error.
///
/// Rationale: there is no known legitimate use for '$', '{', or '}' as literal
/// characters in these glob patterns, so a leftover one most likely indicates
/// a mistake in the definition file.
///
/// Should a real edge case turn up later, the parser can be relaxed.
impl FromStr for Matcher {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        use MatcherSegment as Seg;

        static VAR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\$\{([\w\d_]+)\}").unwrap());

        // Split the input into alternating text / variable segments, dropping
        // empty text segments as we go.
        let mut non_empty_segments: Vec<Seg> = Vec::new();
        let mut cursor = 0;
        for caps in VAR_REGEX.captures_iter(s) {
            let whole = caps.get(0).unwrap();
            let var_name = caps.get(1).unwrap();

            // literal text between the previous variable and this one
            let literal = &s[cursor..whole.start()];
            if !literal.is_empty() {
                non_empty_segments.push(Seg::Text(literal.into()));
            }

            // the variable itself
            non_empty_segments.push(Seg::Env(var_name.as_str().into()));
            cursor = whole.end();
        }

        // whatever follows the last variable
        let tail = &s[cursor..];
        if !tail.is_empty() {
            non_empty_segments.push(Seg::Text(tail.into()));
        }

        // sanity check: two text segments can never be adjacent
        let has_adjacent_text = non_empty_segments
            .windows(2)
            .any(|pair| matches!(pair, [Seg::Text(_), Seg::Text(_)]));
        if has_adjacent_text {
            unreachable!("Parsed into consecutive text segments: {non_empty_segments:?}");
        }

        // an empty matcher is never valid
        if non_empty_segments.is_empty() {
            bail!(r#"Parsed an empty matcher: "{s}""#);
        }

        // reject leftover fragments of variable syntax in plain text
        for seg in &non_empty_segments {
            if let Seg::Text(text) = seg {
                if text.contains(['$', '{', '}']) {
                    bail!(r#"Invalid matcher: "{s}""#);
                }
            }
        }

        Ok(Self(non_empty_segments))
    }
}
impl Matcher {
    /// Render this matcher as the Rust expression for its lazily-built
    /// `GlobMatcher`.
    ///
    /// A matcher made of exactly one text segment is emitted as a call to
    /// `build_matcher_fixed`; every other matcher is emitted as a call to
    /// `build_matcher_dynamic` with its segment list.
    fn codegen(&self) -> String {
        match self.0.as_slice() {
            [] => unreachable!("0-length matcher should never be created"),
            [MatcherSegment::Text(s)] => {
                format!(r###"Lazy::new(|| Some(build_matcher_fixed(r#"{s}"#)))"###)
            }
            // parser logic ensures that this case can only happen when there are dynamic segments
            segments => {
                let segs = segments.iter().map(MatcherSegment::codegen).join(", ");
                format!(r###"Lazy::new(|| build_matcher_dynamic(&[{segs}]))"###)
            }
        }
    }
}
/// One piece of a matcher: either literal glob text or the name of an
/// environment variable to be substituted.
///
/// Corresponds to `syntax_mapping::MatcherSegment`.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
enum MatcherSegment {
    /// Literal text.
    Text(String),
    /// Name of an environment variable.
    Env(String),
}

#[allow(dead_code)]
impl MatcherSegment {
    /// Whether this is a `Text` segment.
    fn is_text(&self) -> bool {
        self.text().is_some()
    }

    /// Whether this is an `Env` segment.
    fn is_env(&self) -> bool {
        self.env().is_some()
    }

    /// The literal text, if this is a `Text` segment.
    fn text(&self) -> Option<&str> {
        if let Self::Text(t) = self {
            Some(t.as_str())
        } else {
            None
        }
    }

    /// The variable name, if this is an `Env` segment.
    fn env(&self) -> Option<&str> {
        if let Self::Env(t) = self {
            Some(t.as_str())
        } else {
            None
        }
    }

    /// Render this segment as the Rust expression written into the generated
    /// source file.
    fn codegen(&self) -> String {
        if let Self::Text(s) = self {
            return format!(r###"MatcherSegment::Text(r#"{s}"#)"###);
        }
        let Self::Env(s) = self else {
            unreachable!("a segment is either text or env")
        };
        format!(r###"MatcherSegment::Env(r#"{s}"#)"###)
    }
}
/// Deserialization model of one .toml file in /src/syntax_mapping/builtins/.
#[derive(Debug, Clone, Deserialize)]
struct MappingDefModel {
    /// The `mappings` table: each target paired with its list of matchers,
    /// kept in an insertion-ordered map.
    mappings: IndexMap<MappingTarget, Vec<Matcher>>,
}
impl MappingDefModel {
    /// Flatten the per-target matcher lists into a single list of
    /// `(matcher, target)` pairs, preserving the order of the map and of each
    /// matcher list.
    fn into_mapping_list(self) -> MappingList {
        let mut pairs = Vec::new();
        for (target, matchers) in self.mappings {
            for matcher in matchers {
                pairs.push((matcher, target.clone()));
            }
        }
        MappingList(pairs)
    }
}
/// A flat, ordered list of `(matcher, target)` rules.
#[derive(Debug, Clone)]
struct MappingList(Vec<(Matcher, MappingTarget)>);
impl MappingList {
    /// Render the whole list as the source text of the `BUILTIN_MAPPINGS`
    /// static array, one `(matcher, target)` tuple per line.
    fn codegen(&self) -> String {
        let mut array_items = Vec::with_capacity(self.0.len());
        for (matcher, target) in &self.0 {
            let m = matcher.codegen();
            let t = target.codegen();
            array_items.push(format!("({m}, {t})"));
        }

        let len = array_items.len();
        let items = array_items.join(",\n");

        format!(
            "/// Generated by build script from /src/syntax_mapping/builtins/.\n\
            pub(crate) static BUILTIN_MAPPINGS: [(Lazy<Option<GlobMatcher>>, MappingTarget); {len}] = [\n{items}\n];"
        )
    }
}
/// Get the list of paths to all mapping definition files that should be
/// included for the current target platform.
fn get_def_paths() -> anyhow::Result<Vec<PathBuf>> {
let source_subdirs = [
"common",
#[cfg(target_family = "unix")]
"unix-family",
#[cfg(any(
target_os = "freebsd",
target_os = "netbsd",
target_os = "openbsd",
target_os = "macos"
))]
"bsd-family",
#[cfg(target_os = "linux")]
"linux",
#[cfg(target_os = "macos")]
"macos",
#[cfg(target_os = "windows")]
"windows",
];
let mut toml_paths = vec![];
for subdir in source_subdirs {
let wd = WalkDir::new(Path::new("src/syntax_mapping/builtins").join(subdir));
let paths = wd
.into_iter()
.filter_map_ok(|entry| {
let path = entry.path();
(path.is_file() && path.extension().map(|ext| ext == "toml").unwrap_or(false))
.then(|| path.to_owned())
})
.collect::<Result<Vec<_>, _>>()?;
toml_paths.extend(paths);
}
toml_paths.sort_by_key(|path| {
path.file_name()
.expect("file name should not terminate in ..")
.to_owned()
});
Ok(toml_paths)
}
fn read_all_mappings() -> anyhow::Result<MappingList> {
let mut all_mappings = vec![];
for path in get_def_paths()? {
let toml_string = fs::read_to_string(path)?;
let mappings = toml::from_str::<MappingDefModel>(&toml_string)?.into_mapping_list();
all_mappings.extend(mappings.0);
}
let duplicates = all_mappings
.iter()
.duplicates_by(|(matcher, _)| matcher)
.collect_vec();
if !duplicates.is_empty() {
bail!("Rules with duplicate matchers found: {duplicates:?}");
}
Ok(MappingList(all_mappings))
}
/// Build the static syntax mappings defined in /src/syntax_mapping/builtins/
/// into a .rs source file, which is to be inserted with `include!`.
pub fn build_static_mappings() -> anyhow::Result<()> {
println!("cargo:rerun-if-changed=src/syntax_mapping/builtins/");
let mappings = read_all_mappings()?;
let codegen_path = Path::new(&env::var_os("OUT_DIR").ok_or(anyhow!("OUT_DIR is unset"))?)
.join("codegen_static_syntax_mappings.rs");
fs::write(codegen_path, mappings.codegen())?;
Ok(())
}

View File

@ -441,7 +441,7 @@ mod tests {
fn new() -> Self {
SyntaxDetectionTest {
assets: HighlightingAssets::from_binary(),
syntax_mapping: SyntaxMapping::builtin(),
syntax_mapping: SyntaxMapping::new(),
temp_dir: TempDir::new().expect("creation of temporary directory"),
}
}

View File

@ -121,7 +121,7 @@ impl App {
_ => unreachable!("other values for --paging are not allowed"),
};
let mut syntax_mapping = SyntaxMapping::builtin();
let mut syntax_mapping = SyntaxMapping::new();
if let Some(values) = self.matches.get_many::<String>("ignored-suffix") {
for suffix in values {
@ -130,7 +130,9 @@ impl App {
}
if let Some(values) = self.matches.get_many::<String>("map-syntax") {
for from_to in values {
// later args take precedence over earlier ones, hence `.rev()`
// see: https://github.com/sharkdp/bat/pull/2755#discussion_r1456416875
for from_to in values.rev() {
let parts: Vec<_> = from_to.split(':').collect();
if parts.len() != 2 {

View File

@ -78,9 +78,11 @@ fn run_cache_subcommand(
Ok(())
}
fn get_syntax_mapping_to_paths<'a>(
mappings: &[(GlobMatcher, MappingTarget<'a>)],
) -> HashMap<&'a str, Vec<String>> {
fn get_syntax_mapping_to_paths<'r, 't, I>(mappings: I) -> HashMap<&'t str, Vec<String>>
where
I: IntoIterator<Item = (&'r GlobMatcher, &'r MappingTarget<'t>)>,
't: 'r, // target text outlives rule
{
let mut map = HashMap::new();
for mapping in mappings {
if let (matcher, MappingTarget::MapTo(s)) = mapping {
@ -123,7 +125,7 @@ pub fn get_languages(config: &Config, cache_dir: &Path) -> Result<String> {
languages.sort_by_key(|lang| lang.name.to_uppercase());
let configured_languages = get_syntax_mapping_to_paths(config.syntax_mapping.mappings());
let configured_languages = get_syntax_mapping_to_paths(config.syntax_mapping.all_mappings());
for lang in &mut languages {
if let Some(additional_paths) = configured_languages.get(lang.name.as_str()) {

View File

@ -1,12 +1,23 @@
use std::path::Path;
use crate::error::Result;
use ignored_suffixes::IgnoredSuffixes;
use globset::{Candidate, GlobBuilder, GlobMatcher};
use crate::error::Result;
use builtin::BUILTIN_MAPPINGS;
use ignored_suffixes::IgnoredSuffixes;
mod builtin;
pub mod ignored_suffixes;
fn make_glob_matcher(from: &str) -> Result<GlobMatcher> {
let matcher = GlobBuilder::new(from)
.case_insensitive(true)
.literal_separator(true)
.build()?
.compile_matcher();
Ok(matcher)
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum MappingTarget<'a> {
@ -29,204 +40,72 @@ pub enum MappingTarget<'a> {
#[derive(Debug, Clone, Default)]
pub struct SyntaxMapping<'a> {
mappings: Vec<(GlobMatcher, MappingTarget<'a>)>,
/// User-defined mappings at run time.
///
/// Rules in front have precedence.
custom_mappings: Vec<(GlobMatcher, MappingTarget<'a>)>,
pub(crate) ignored_suffixes: IgnoredSuffixes<'a>,
}
impl<'a> SyntaxMapping<'a> {
pub fn empty() -> SyntaxMapping<'a> {
pub fn new() -> SyntaxMapping<'a> {
Default::default()
}
pub fn builtin() -> SyntaxMapping<'a> {
let mut mapping = Self::empty();
mapping.insert("*.h", MappingTarget::MapTo("C++")).unwrap();
mapping
.insert(".clang-format", MappingTarget::MapTo("YAML"))
.unwrap();
mapping.insert("*.fs", MappingTarget::MapTo("F#")).unwrap();
mapping
.insert("build", MappingTarget::MapToUnknown)
.unwrap();
mapping
.insert("**/.ssh/config", MappingTarget::MapTo("SSH Config"))
.unwrap();
mapping
.insert(
"**/bat/config",
MappingTarget::MapTo("Bourne Again Shell (bash)"),
)
.unwrap();
mapping
.insert(
"/etc/profile",
MappingTarget::MapTo("Bourne Again Shell (bash)"),
)
.unwrap();
mapping
.insert(
"os-release",
MappingTarget::MapTo("Bourne Again Shell (bash)"),
)
.unwrap();
mapping
.insert("*.pac", MappingTarget::MapTo("JavaScript (Babel)"))
.unwrap();
mapping
.insert("fish_history", MappingTarget::MapTo("YAML"))
.unwrap();
for glob in ["*.jsonl", "*.sarif"] {
mapping.insert(glob, MappingTarget::MapTo("JSON")).unwrap();
}
// See #2151, https://nmap.org/book/nse-language.html
mapping
.insert("*.nse", MappingTarget::MapTo("Lua"))
.unwrap();
// See #1008
mapping
.insert("rails", MappingTarget::MapToUnknown)
.unwrap();
mapping
.insert("Containerfile", MappingTarget::MapTo("Dockerfile"))
.unwrap();
mapping
.insert("*.ksh", MappingTarget::MapTo("Bourne Again Shell (bash)"))
.unwrap();
// Nginx and Apache syntax files both want to style all ".conf" files
// see #1131 and #1137
mapping
.insert("*.conf", MappingTarget::MapExtensionToUnknown)
.unwrap();
for glob in &[
"/etc/nginx/**/*.conf",
"/etc/nginx/sites-*/**/*",
"nginx.conf",
"mime.types",
] {
mapping.insert(glob, MappingTarget::MapTo("nginx")).unwrap();
}
for glob in &[
"/etc/apache2/**/*.conf",
"/etc/apache2/sites-*/**/*",
"httpd.conf",
] {
mapping
.insert(glob, MappingTarget::MapTo("Apache Conf"))
.unwrap();
}
for glob in &[
"**/systemd/**/*.conf",
"**/systemd/**/*.example",
"*.automount",
"*.device",
"*.dnssd",
"*.link",
"*.mount",
"*.netdev",
"*.network",
"*.nspawn",
"*.path",
"*.service",
"*.scope",
"*.slice",
"*.socket",
"*.swap",
"*.target",
"*.timer",
] {
mapping.insert(glob, MappingTarget::MapTo("INI")).unwrap();
}
// unix mail spool
for glob in &["/var/spool/mail/*", "/var/mail/*"] {
mapping.insert(glob, MappingTarget::MapTo("Email")).unwrap()
}
// pacman hooks
mapping
.insert("*.hook", MappingTarget::MapTo("INI"))
.unwrap();
mapping
.insert("*.ron", MappingTarget::MapTo("Rust"))
.unwrap();
// Global git config files rooted in `$XDG_CONFIG_HOME/git/` or `$HOME/.config/git/`
// See e.g. https://git-scm.com/docs/git-config#FILES
match (
std::env::var_os("XDG_CONFIG_HOME").filter(|val| !val.is_empty()),
std::env::var_os("HOME")
.filter(|val| !val.is_empty())
.map(|home| Path::new(&home).join(".config")),
) {
(Some(xdg_config_home), Some(default_config_home))
if xdg_config_home == default_config_home => {
insert_git_config_global(&mut mapping, &xdg_config_home)
}
(Some(xdg_config_home), Some(default_config_home)) /* else guard */ => {
insert_git_config_global(&mut mapping, &xdg_config_home);
insert_git_config_global(&mut mapping, &default_config_home)
}
(Some(config_home), None) => insert_git_config_global(&mut mapping, &config_home),
(None, Some(config_home)) => insert_git_config_global(&mut mapping, &config_home),
(None, None) => (),
};
fn insert_git_config_global(mapping: &mut SyntaxMapping, config_home: impl AsRef<Path>) {
let git_config_path = config_home.as_ref().join("git");
mapping
.insert(
&git_config_path.join("config").to_string_lossy(),
MappingTarget::MapTo("Git Config"),
)
.ok();
mapping
.insert(
&git_config_path.join("ignore").to_string_lossy(),
MappingTarget::MapTo("Git Ignore"),
)
.ok();
mapping
.insert(
&git_config_path.join("attributes").to_string_lossy(),
MappingTarget::MapTo("Git Attributes"),
)
.ok();
}
mapping
}
pub fn insert(&mut self, from: &str, to: MappingTarget<'a>) -> Result<()> {
let glob = GlobBuilder::new(from)
.case_insensitive(true)
.literal_separator(true)
.build()?;
self.mappings.push((glob.compile_matcher(), to));
let matcher = make_glob_matcher(from)?;
self.custom_mappings.push((matcher, to));
Ok(())
}
pub fn mappings(&self) -> &[(GlobMatcher, MappingTarget<'a>)] {
&self.mappings
/// Returns an iterator over all mappings. User-defined mappings are listed
/// before builtin mappings; mappings in front have higher precedence.
///
/// Builtin mappings' `GlobMatcher`s are lazily compiled.
///
/// Note that this function only returns mappings that are valid under the
/// current environment. For details see [`Self::builtin_mappings`].
pub fn all_mappings(&self) -> impl Iterator<Item = (&GlobMatcher, &MappingTarget<'a>)> {
self.custom_mappings()
.iter()
.map(|(matcher, target)| (matcher, target)) // as_ref
.chain(
// we need a map with a closure to "do" the lifetime variance
// see: https://discord.com/channels/273534239310479360/1120124565591425034/1170543402870382653
// also, clippy false positive:
// see: https://github.com/rust-lang/rust-clippy/issues/9280
#[allow(clippy::map_identity)]
self.builtin_mappings().map(|rule| rule),
)
}
pub(crate) fn get_syntax_for(&self, path: impl AsRef<Path>) -> Option<MappingTarget<'a>> {
/// Returns an iterator over all valid builtin mappings. Mappings in front
/// have higher precedence.
///
/// The `GlobMatcher`s are lazily compiled.
///
/// Mappings that are invalid under the current environment (i.e. rule
/// requires environment variable(s) that is unset, or the joined string
/// after variable(s) replacement is not a valid glob expression) are
/// ignored.
pub fn builtin_mappings(
&self,
) -> impl Iterator<Item = (&'static GlobMatcher, &'static MappingTarget<'static>)> {
BUILTIN_MAPPINGS
.iter()
.filter_map(|(matcher, target)| matcher.as_ref().map(|glob| (glob, target)))
}
/// Returns all user-defined mappings.
pub fn custom_mappings(&self) -> &[(GlobMatcher, MappingTarget<'a>)] {
&self.custom_mappings
}
pub fn get_syntax_for(&self, path: impl AsRef<Path>) -> Option<MappingTarget<'a>> {
// Try matching on the file name as-is.
let candidate = Candidate::new(&path);
let candidate_filename = path.as_ref().file_name().map(Candidate::new);
for (ref glob, ref syntax) in self.mappings.iter().rev() {
for (glob, syntax) in self.all_mappings() {
if glob.is_match_candidate(&candidate)
|| candidate_filename
.as_ref()
@ -252,9 +131,46 @@ impl<'a> SyntaxMapping<'a> {
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn basic() {
let mut map = SyntaxMapping::empty();
fn builtin_mappings_work() {
let map = SyntaxMapping::new();
assert_eq!(
map.get_syntax_for("/path/to/build"),
Some(MappingTarget::MapToUnknown)
);
}
#[test]
fn all_fixed_builtin_mappings_can_compile() {
let map = SyntaxMapping::new();
// collect call evaluates all lazy closures
// fixed builtin mappings will panic if they fail to compile
let _mappings = map.builtin_mappings().collect::<Vec<_>>();
}
#[test]
fn builtin_mappings_matcher_only_compile_once() {
let map = SyntaxMapping::new();
let two_iterations: Vec<_> = (0..2)
.map(|_| {
// addresses of every matcher
map.builtin_mappings()
.map(|(matcher, _)| matcher as *const _ as usize)
.collect::<Vec<_>>()
})
.collect();
// if the matchers are only compiled once, their address should remain the same
assert_eq!(two_iterations[0], two_iterations[1]);
}
#[test]
fn custom_mappings_work() {
let mut map = SyntaxMapping::new();
map.insert("/path/to/Cargo.lock", MappingTarget::MapTo("TOML"))
.ok();
map.insert("/path/to/.ignore", MappingTarget::MapTo("Git Ignore"))
@ -273,52 +189,32 @@ mod tests {
}
#[test]
fn user_can_override_builtin_mappings() {
let mut map = SyntaxMapping::builtin();
fn custom_mappings_override_builtin() {
let mut map = SyntaxMapping::new();
assert_eq!(
map.get_syntax_for("/etc/profile"),
Some(MappingTarget::MapTo("Bourne Again Shell (bash)"))
map.get_syntax_for("/path/to/httpd.conf"),
Some(MappingTarget::MapTo("Apache Conf"))
);
map.insert("/etc/profile", MappingTarget::MapTo("My Syntax"))
map.insert("httpd.conf", MappingTarget::MapTo("My Syntax"))
.ok();
assert_eq!(
map.get_syntax_for("/etc/profile"),
map.get_syntax_for("/path/to/httpd.conf"),
Some(MappingTarget::MapTo("My Syntax"))
);
}
#[test]
fn builtin_mappings() {
let map = SyntaxMapping::builtin();
fn custom_mappings_precedence() {
let mut map = SyntaxMapping::new();
map.insert("/path/to/foo", MappingTarget::MapTo("alpha"))
.ok();
map.insert("/path/to/foo", MappingTarget::MapTo("bravo"))
.ok();
assert_eq!(
map.get_syntax_for("/path/to/build"),
Some(MappingTarget::MapToUnknown)
map.get_syntax_for("/path/to/foo"),
Some(MappingTarget::MapTo("alpha"))
);
}
#[test]
/// verifies that SyntaxMapping::builtin() doesn't repeat `Glob`-based keys
fn no_duplicate_builtin_keys() {
let mappings = SyntaxMapping::builtin().mappings;
for i in 0..mappings.len() {
let tail = mappings[i + 1..].into_iter();
let (dupl, _): (Vec<_>, Vec<_>) =
tail.partition(|item| item.0.glob() == mappings[i].0.glob());
// emit repeats on failure
assert_eq!(
dupl.len(),
0,
"Glob pattern `{}` mapped to multiple: {:?}",
mappings[i].0.glob().glob(),
{
let (_, mut dupl_targets): (Vec<GlobMatcher>, Vec<MappingTarget>) =
dupl.into_iter().cloned().unzip();
dupl_targets.push(mappings[i].1)
},
)
}
}
}

View File

@ -0,0 +1,91 @@
use std::env;
use globset::GlobMatcher;
use once_cell::sync::Lazy;
use crate::syntax_mapping::{make_glob_matcher, MappingTarget};
// Static syntax mappings generated from /src/syntax_mapping/builtins/ by the
// build script (/build/syntax_mapping.rs).
include!(concat!(
env!("OUT_DIR"),
"/codegen_static_syntax_mappings.rs"
));
// The defined matcher strings are analysed at compile time and converted into
// lazily-compiled `GlobMatcher`s. This is so that the string searches are moved
// from run time to compile time, thus improving startup performance.
//
// To any future maintainer (including possibly myself) wondering why there is
// not a `BuiltinMatcher` enum that looks like this:
//
// ```
// enum BuiltinMatcher {
// Fixed(&'static str),
// Dynamic(Lazy<Option<String>>),
// }
// ```
//
// Because there was. I tried it and threw it out.
//
// Naively looking at the problem from a distance, this may seem like a good
// design (strongly typed etc. etc.). It would also save on compiled size by
// extracting out common behaviour into functions. But while actually
// implementing the lazy matcher compilation logic, I realised that it's most
// convenient for `BUILTIN_MAPPINGS` to have the following type:
//
// `[(Lazy<Option<GlobMatcher>>, MappingTarget); N]`
//
// The benefit for this is that operations like listing all builtin mappings
// would be effectively memoised. The caller would not have to compile another
// `GlobMatcher` for rules that they have previously visited.
//
// Unfortunately, this means we are going to have to store a distinct closure
// for each rule anyway, which makes a `BuiltinMatcher` enum a pointless layer
// of indirection.
//
// In the current implementation, the closure within each generated rule simply
// calls either `build_matcher_fixed` or `build_matcher_dynamic`, depending on
// whether the defined matcher contains dynamic segments or not.
/// Turn a fixed glob string into a compiled glob matcher.
///
/// Panics if the glob fails to compile: a broken builtin fixed rule is a
/// fatal error.
///
/// Used internally by `Lazy<Option<GlobMatcher>>`'s lazy evaluation closure.
fn build_matcher_fixed(from: &str) -> GlobMatcher {
    let compiled = make_glob_matcher(from);
    compiled.expect("A builtin fixed glob matcher failed to compile")
}
/// Join a list of matcher segments to create a glob string, replacing all
/// environment variables, then compile to a glob matcher.
///
/// Returns `None` if any replacement fails, or if the joined glob string fails
/// to compile. A replacement fails when the variable is unset, is not valid
/// Unicode, or is set to the empty string.
///
/// Used internally by `Lazy<Option<GlobMatcher>>`'s lazy evaluation closure.
fn build_matcher_dynamic(segs: &[MatcherSegment]) -> Option<GlobMatcher> {
    // join segments
    let mut buf = String::new();
    for seg in segs {
        match seg {
            MatcherSegment::Text(s) => buf.push_str(s),
            MatcherSegment::Env(var) => {
                // An empty value is treated like an unset variable: splicing
                // it in would silently turn e.g. `${XDG_CONFIG_HOME}/git/config`
                // into `/git/config`, a rule nobody intended.
                let replaced = env::var(var).ok().filter(|val| !val.is_empty())?;
                buf.push_str(&replaced);
            }
        }
    }
    // compile glob matcher; an invalid glob after replacement disables the rule
    make_glob_matcher(&buf).ok()
}
/// One segment of a dynamic builtin matcher.
///
/// Used internally by `Lazy<Option<GlobMatcher>>`'s lazy evaluation closure.
#[derive(Debug, Clone)]
enum MatcherSegment {
    /// Literal text, appended to the glob string as-is.
    Text(&'static str),
    /// Name of an environment variable whose value is substituted in.
    Env(&'static str),
}

View File

@ -0,0 +1,116 @@
# `/src/syntax_mapping/builtins`
The files in this directory define path/name-based syntax mappings, which amend
and take precedence over the extension/content-based syntax mappings provided by
[syntect](https://github.com/trishume/syntect).
## File organisation
Each TOML file should describe the syntax mappings of a single application, or
otherwise a set of logically-related rules.
What defines "a single application" here is deliberately vague, since the
file-splitting is purely for maintainability reasons. (Technically, we could
just as well use a single TOML file.) So just use common sense.
TOML files should reside in the corresponding subdirectory of the platform(s)
that they intend to target. At compile time, the build script will go through
each subdirectory that is applicable to the compilation target, collect the
syntax mappings defined by all TOML files, and embed them into the binary.
## File syntax
Each TOML file should contain a single section named `mappings`, with each of
its keys being a language identifier (first column of `bat -L`; also referred to
as "target").
The value of each key should be an array of strings, with each item being a glob
matcher. We will call each of these items a "rule".
For example, if `foo-application` uses both TOML and YAML configuration files,
we could write something like this:
```toml
# 30-foo-application.toml
[mappings]
"TOML" = [
# rules for TOML syntax go here
"/usr/share/foo-application/toml-config/*.conf",
"/etc/foo-application/toml-config/*.conf",
]
"YAML" = [
# rules for YAML syntax go here
# ...
]
```
### Dynamic environment variable replacement
In addition to the standard glob matcher syntax, rules also support dynamic
replacement of environment variables at runtime. This allows us to concisely
handle things like [XDG](https://specifications.freedesktop.org/basedir-spec/latest/).
All environment variables intended to be replaced at runtime must be enclosed in
`${}`, for example `"/foo/*/${YOUR_ENV}-suffix/*.log"`. Note that this is the
**only** admissible syntax; other variable substitution syntaxes are not
supported and will either cause a compile time error, or be treated as plain
text.
For example, if `foo-application` also supports per-user configuration files, we
could write something like this:
```toml
# 30-foo-application.toml
[mappings]
"TOML" = [
# rules for TOML syntax go here
"/usr/share/foo-application/toml-config/*.conf",
"/etc/foo-application/toml-config/*.conf",
"${XDG_CONFIG_HOME}/foo-application/toml-config/*.conf",
"${HOME}/.config/foo-application/toml-config/*.conf",
]
"YAML" = [
# rules for YAML syntax go here
# ...
]
```
If any environment variable replacement in a rule fails (for example when a
variable is unset), or if the glob string after replacements is invalid, the
entire rule will be ignored.
### Explicitly mapping to unknown
Sometimes it may be necessary to "unset" a particular syntect mapping - perhaps
a syntax's matching rules are "too greedy", and it is claiming files that it should
not. In this case, there are two special identifiers:
`MappingTarget::MapToUnknown` and `MappingTarget::MapExtensionToUnknown`
(corresponding to the variants of the same names in the `syntax_mapping::MappingTarget` enum).
An example of this would be `*.conf` files in general. So we may write something
like this:
```toml
# 99-unset-ambiguous-extensions.toml
[mappings]
"MappingTarget::MapExtensionToUnknown" = [
"*.conf",
]
```
## Ordering
At compile time, all TOML files applicable to the target are processed in
lexicographical filename order. So `00-foo.toml` takes precedence over
`10-bar.toml`, which takes precedence over `20-baz.toml`, and so on. Note that
**only** the filenames of the TOML files are taken into account; the
subdirectories they are placed in have no influence on ordering.
This behaviour can be occasionally useful for creating high/low priority rules,
such as in the aforementioned example of explicitly mapping `*.conf` files to
unknown. Generally this should not be much of a concern though, since rules
should be written as specifically as possible for each application.
Rules within each TOML file are processed (and therefore matched) in the order
in which they are defined. At runtime, the syntax selection algorithm will
short-circuit and return the target of the first matching rule.

View File

@ -0,0 +1,2 @@
[mappings]
"Bourne Again Shell (bash)" = ["/etc/os-release", "/var/run/os-release"]

View File

@ -0,0 +1,2 @@
[mappings]
"Apache Conf" = ["httpd.conf"]

View File

@ -0,0 +1,2 @@
[mappings]
"Bourne Again Shell (bash)" = ["**/bat/config"]

View File

@ -0,0 +1,2 @@
[mappings]
"Dockerfile" = ["Containerfile"]

View File

@ -0,0 +1,6 @@
[mappings]
"C++" = [
# probably better than the default Objective C mapping #877
"*.h",
]
"YAML" = [".clang-format"]

View File

@ -0,0 +1,2 @@
[mappings]
"F#" = ["*.fs"]

View File

@ -0,0 +1,10 @@
# Global git config files rooted in `$XDG_CONFIG_HOME/git/` or `$HOME/.config/git/`
# See e.g. https://git-scm.com/docs/git-config#FILES
[mappings]
"Git Config" = ["${XDG_CONFIG_HOME}/git/config", "${HOME}/.config/git/config"]
"Git Ignore" = ["${XDG_CONFIG_HOME}/git/ignore", "${HOME}/.config/git/ignore"]
"Git Attributes" = [
"${XDG_CONFIG_HOME}/git/attributes",
"${HOME}/.config/git/attributes",
]

View File

@ -0,0 +1,3 @@
# JSON Lines is a simple variation of JSON #2535
[mappings]
"JSON" = ["*.jsonl"]

View File

@ -0,0 +1,2 @@
[mappings]
"nginx" = ["nginx.conf", "mime.types"]

View File

@ -0,0 +1,3 @@
[mappings]
# See #2151, https://nmap.org/book/nse-language.html
"Lua" = ["*.nse"]

View File

@ -0,0 +1,3 @@
# See #1515
[mappings]
"JavaScript (Babel)" = ["*.pac"]

View File

@ -0,0 +1,3 @@
# Rusty Object Notation #2427
[mappings]
"Rust" = ["*.ron"]

View File

@ -0,0 +1,3 @@
# SARIF is a format for reporting static analysis results #2695
[mappings]
"JSON" = ["*.sarif"]

View File

@ -0,0 +1,2 @@
[mappings]
"SSH Config" = ["**/.ssh/config"]

View File

@ -0,0 +1,5 @@
[mappings]
"MappingTarget::MapExtensionToUnknown" = [
# common extension used for all kinds of formats
"*.conf",
]

View File

@ -0,0 +1,7 @@
[mappings]
"MappingTarget::MapToUnknown" = [
# "NAnt Build File" should only match *.build files, not files named "build"
"build",
# "bin/rails" scripts in a Ruby project misidentified as HTML (Rails) #1008
"rails",
]

View File

@ -0,0 +1,7 @@
[mappings]
"Bourne Again Shell (bash)" = [
"/etc/os-release",
"/usr/lib/os-release",
"/etc/initrd-release",
"/usr/lib/extension-release.d/extension-release.*",
]

View File

@ -0,0 +1,3 @@
[mappings]
# pacman hooks
"INI" = ["/usr/share/libalpm/hooks/*.hook", "/etc/pacman.d/hooks/*.hook"]

View File

@ -0,0 +1,21 @@
[mappings]
"INI" = [
"**/systemd/**/*.conf",
"**/systemd/**/*.example",
"*.automount",
"*.device",
"*.dnssd",
"*.link",
"*.mount",
"*.netdev",
"*.network",
"*.nspawn",
"*.path",
"*.service",
"*.scope",
"*.slice",
"*.socket",
"*.swap",
"*.target",
"*.timer",
]

View File

@ -0,0 +1,2 @@
[mappings]
"Apache Conf" = ["/etc/apache2/**/*.conf", "/etc/apache2/sites-*/**/*"]

View File

@ -0,0 +1,2 @@
[mappings]
"YAML" = ["fish_history"]

View File

@ -0,0 +1,3 @@
# KornShell is backward-compatible with the Bourne shell #2633
[mappings]
"Bourne Again Shell (bash)" = ["*.ksh"]

View File

@ -0,0 +1,2 @@
[mappings]
"Email" = ["/var/spool/mail/*", "/var/mail/*"]

View File

@ -0,0 +1,2 @@
[mappings]
"nginx" = ["/etc/nginx/**/*.conf", "/etc/nginx/sites-*/**/*"]

View File

@ -0,0 +1,5 @@
[mappings]
"Bourne Again Shell (bash)" = [
# used by lots of shells
"/etc/profile",
]

View File

@ -9,6 +9,13 @@ if ! command -v hyperfine > /dev/null 2>&1; then
exit 1
fi
# Check that jq is installed.
if ! command -v jq > /dev/null 2>&1; then
echo "'jq' does not seem to be installed."
echo "You can get it here: https://jqlang.github.io/jq/download/"
exit 1
fi
# Check that python3 is installed.
if ! command -v python3 > /dev/null 2>&1; then
echo "'python3' does not seem to be installed."
@ -95,10 +102,20 @@ hyperfine \
cat "$RESULT_DIR/startup-time.md" >> "$REPORT"
heading "Startup time without syntax highlighting"
hyperfine \
"$(printf "%q" "$BAT") --no-config startup-time-src/small-CpuInfo-file.cpuinfo" \
--command-name "bat … small-CpuInfo-file.cpuinfo" \
--warmup "$WARMUP_COUNT" \
--runs "$RUN_COUNT" \
--export-markdown "$RESULT_DIR/startup-time-without-syntax-highlighting.md" \
--export-json "$RESULT_DIR/startup-time-without-syntax-highlighting.json"
cat "$RESULT_DIR/startup-time-without-syntax-highlighting.md" >> "$REPORT"
heading "Startup time with syntax highlighting"
hyperfine \
"$(printf "%q" "$BAT") --no-config --color=always startup-time-src/small-CpuInfo-file.cpuinfo" \
--command-name "bat … small-CpuInfo-file.cpuinfo" \
--command-name "bat … --color=always small-CpuInfo-file.cpuinfo" \
--warmup "$WARMUP_COUNT" \
--runs "$RUN_COUNT" \
--export-markdown "$RESULT_DIR/startup-time-with-syntax-highlighting.md" \
@ -117,6 +134,40 @@ hyperfine \
cat "$RESULT_DIR/startup-time-with-syntax-with-dependencies.md" >> "$REPORT"
heading "Startup time with indeterminant syntax"
hyperfine \
"$(printf "%q" "$BAT") --no-config --color=always startup-time-src/mystery-file" \
--shell none \
--command-name 'bat … mystery-file' \
--warmup "$WARMUP_COUNT" \
--runs "$RUN_COUNT" \
--export-markdown "$RESULT_DIR/startup-time-with-indeterminant-syntax.md" \
--export-json "$RESULT_DIR/startup-time-with-indeterminant-syntax.json"
cat "$RESULT_DIR/startup-time-with-indeterminant-syntax.md" >> "$REPORT"
heading "Startup time with manually set syntax"
hyperfine \
"$(printf "%q" "$BAT") --no-config --color=always --language=Dockerfile startup-time-src/mystery-file" \
--shell none \
--command-name 'bat … --language=Dockerfile mystery-file' \
--warmup "$WARMUP_COUNT" \
--runs "$RUN_COUNT" \
--export-markdown "$RESULT_DIR/startup-time-with-manually-set-syntax.md" \
--export-json "$RESULT_DIR/startup-time-with-manually-set-syntax.json"
cat "$RESULT_DIR/startup-time-with-manually-set-syntax.md" >> "$REPORT"
heading "Startup time with mapped syntax"
hyperfine \
"$(printf "%q" "$BAT") --no-config --color=always startup-time-src/Containerfile" \
--shell none \
--command-name 'bat … Containerfile' \
--warmup "$WARMUP_COUNT" \
--runs "$RUN_COUNT" \
--export-markdown "$RESULT_DIR/startup-time-with-mapped-syntax.md" \
--export-json "$RESULT_DIR/startup-time-with-mapped-syntax.json"
cat "$RESULT_DIR/startup-time-with-mapped-syntax.md" >> "$REPORT"
heading "Plain-text speed"
hyperfine \
"$(printf "%q" "$BAT") --no-config --language=txt --style=plain highlighting-speed-src/numpy_test_multiarray.py" \

View File

@ -0,0 +1,3 @@
FROM docker.io/alpine:latest
COPY foo /root/bar
RUN sleep 60

View File

@ -0,0 +1,3 @@
FROM docker.io/alpine:latest
COPY foo /root/bar
RUN sleep 60