From 609f1adf909b666592d3079490cb3094d3dc031e Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Sun, 9 Jun 2024 22:57:00 -0600 Subject: [PATCH] Fix unicode encoding of hyperlinks The problem is that I based the code on the implementation in ripgrep. But while ripgrep is writing directly to the stream, I am using a Formatter, which means I have to write characters, not raw bytes. Thus we need to percent-encode all non-ASCII bytes (or we could switch to writing bytes directly, but that would be more complicated, and I think percent-encoding is safer anyway). --- src/hyperlink.rs | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/hyperlink.rs b/src/hyperlink.rs index d27f7b4..4e6ee8f 100644 --- a/src/hyperlink.rs +++ b/src/hyperlink.rs @@ -26,17 +26,16 @@ impl fmt::Display for PathUrl { } fn encode(f: &mut Formatter, byte: u8) -> fmt::Result { + // NOTE: + // Most terminals can handle non-ascii unicode characters in a file url fine. But on some OSes (notably + // windows), the encoded bytes of the path may not be valid UTF-8. Since we don't know if a + // byte >= 128 is part of a valid UTF-8 encoding or not, we just percent encode any non-ascii + // byte. + // Percent encoding these bytes is probably safer anyway. match byte { - b'0'..=b'9' - | b'A'..=b'Z' - | b'a'..=b'z' - | b'/' - | b':' - | b'-' - | b'.' - | b'_' - | b'~' - | 128.. => f.write_char(byte.into()), + b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'/' | b':' | b'-' | b'.'
| b'_' | b'~' => { + f.write_char(byte.into()) + } #[cfg(windows)] b'\\' => f.write_char('/'), _ => { @@ -61,3 +60,21 @@ fn host() -> &'static str { const fn host() -> &'static str { "" } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_unicode_encoding() { + let path: PathBuf = "/$*\x1bßé/∫😃".into(); + let url = PathUrl::new(&path).unwrap(); + assert_eq!( + url.to_string(), + format!( + "file://{}/%24%2A%1B%C3%9F%C3%A9/%E2%88%AB%F0%9F%98%83", + host() + ), + ); + } +}