[logfile] handle logs with ANSI escapes

This is a partial fix for handling ANSI escape sequences in parts of log messages, which would otherwise prevent regexes from matching. There is still more work to do. Related to #1057
2022-09-14 21:43:36 -07:00 · 2022-09-14 21:43:36 -07:00 · 740b827901
parent 8613ad4d47
commit 740b827901
18 changed files with 21358 additions and 20328 deletions
--- a/src/base/is_utf8.cc
+++ b/src/base/is_utf8.cc
@ -59,21 +59,26 @@
  `faulty_bytes` the number of actually existing bytes taking part in this
  error.
 */
-ssize_t
+utf8_scan_result
 is_utf8(const unsigned char* str,
        size_t len,
        const char** message,
        int* faulty_bytes,
        nonstd::optional<unsigned char> terminator)
 {
-    size_t i = 0;
+    bool has_ansi = false;
+    ssize_t i = 0;

    *message = nullptr;
    *faulty_bytes = 0;
    while (i < len) {
+        if (str[i] == '\x1b') {
+            has_ansi = true;
+        }
+
        if (terminator && str[i] == terminator.value()) {
            *message = nullptr;
-            return i;
+            return {i, has_ansi};
        }

        if (str[i] <= 0x7F) /* 00..7F */ {
@ -85,14 +90,14 @@ is_utf8(const unsigned char* str,
                        = "After a first byte between C2 and DF, expecting a "
                          "2nd byte between 80 and BF";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte between C2 and DF, expecting a 2nd "
                      "byte.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 2;
        } else if (str[i] == 0xE0) /* E0 A0..BF 80..BF */ {
@ -102,21 +107,21 @@ is_utf8(const unsigned char* str,
                        = "After a first byte of E0, expecting a 2nd byte "
                          "between A0 and BF.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte of E0, expecting a 3nd byte "
                          "between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte of E0, expecting two following "
                      "bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 3;
        } else if (str[i] >= 0xE1 && str[i] <= 0xEC) /* E1..EC 80..BF 80..BF */
@ -127,21 +132,21 @@ is_utf8(const unsigned char* str,
                        = "After a first byte between E1 and EC, expecting the "
                          "2nd byte between 80 and BF.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte between E1 and EC, expecting the "
                          "3rd byte between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte between E1 and EC, expecting two "
                      "following bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 3;
        } else if (str[i] == 0xED) /* ED 80..9F 80..BF */ {
@ -151,21 +156,21 @@ is_utf8(const unsigned char* str,
                        = "After a first byte of ED, expecting 2nd byte "
                          "between 80 and 9F.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte of ED, expecting 3rd byte "
                          "between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte of ED, expecting two following "
                      "bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 3;
        } else if (str[i] >= 0xEE && str[i] <= 0xEF) /* EE..EF 80..BF 80..BF */
@ -176,21 +181,21 @@ is_utf8(const unsigned char* str,
                        = "After a first byte between EE and EF, expecting 2nd "
                          "byte between 80 and BF.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte between EE and EF, expecting 3rd "
                          "byte between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte between EE and EF, two following "
                      "bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 3;
        } else if (str[i] == 0xF0) /* F0 90..BF 80..BF 80..BF */ {
@ -200,60 +205,61 @@ is_utf8(const unsigned char* str,
                        = "After a first byte of F0, expecting 2nd byte "
                          "between 90 and BF.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte of F0, expecting 3rd byte "
                          "between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
                    *message
                        = "After a first byte of F0, expecting 4th byte "
                          "between 80 and BF.";
                    *faulty_bytes = 4;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte of F0, expecting three following "
                      "bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 4;
        } else if (str[i] >= 0xF1
-                   && str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */ {
+                   && str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */
+        {
            if (i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ {
                if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) {
                    *message
                        = "After a first byte of F1, F2, or F3, expecting a "
                          "2nd byte between 80 and BF.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte of F1, F2, or F3, expecting a "
                          "3rd byte between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
                    *message
                        = "After a first byte of F1, F2, or F3, expecting a "
                          "4th byte between 80 and BF.";
                    *faulty_bytes = 4;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte of F1, F2, or F3, expecting three "
                      "following bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 4;
        } else if (str[i] == 0xF4) /* F4 80..8F 80..BF 80..BF */ {
@ -263,36 +269,36 @@ is_utf8(const unsigned char* str,
                        = "After a first byte of F4, expecting 2nd byte "
                          "between 80 and 8F.";
                    *faulty_bytes = 2;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
                    *message
                        = "After a first byte of F4, expecting 3rd byte "
                          "between 80 and BF.";
                    *faulty_bytes = 3;
-                    return i;
+                    return {i, has_ansi};
                }
                if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
                    *message
                        = "After a first byte of F4, expecting 4th byte "
                          "between 80 and BF.";
                    *faulty_bytes = 4;
-                    return i;
+                    return {i, has_ansi};
                }
            } else {
                *message
                    = "After a first byte of F4, expecting three following "
                      "bytes.";
                *faulty_bytes = 1;
-                return i;
+                return {i, has_ansi};
            }
            i += 4;
        } else {
            *message
                = "Expecting bytes in the following ranges: 00..7F C2..F4.";
            *faulty_bytes = 1;
-            return i;
+            return {i, has_ansi};
        }
    }
-    return -1;
+    return {-1, has_ansi};
 }
--- a/src/base/is_utf8.hh
+++ b/src/base/is_utf8.hh
@ -33,10 +33,16 @@

 #include "optional.hpp"

-ssize_t is_utf8(const unsigned char* str,
+struct utf8_scan_result {
+    ssize_t usr_end{0};
+    bool usr_has_ansi{false};
+};
+
+utf8_scan_result is_utf8(const unsigned char* str,
                         size_t len,
                         const char** message,
                         int* faulty_bytes,
-                nonstd::optional<unsigned char> terminator = nonstd::nullopt);
+                         nonstd::optional<unsigned char> terminator
+                         = nonstd::nullopt);

 #endif /* _IS_UTF8_H */
--- a/src/base/string_util.cc
+++ b/src/base/string_util.cc
@ -45,14 +45,14 @@ scrub_to_utf8(char* buffer, size_t length)
    int faulty_bytes;

    while (true) {
-        ssize_t utf8_end
+        auto scan_res
            = is_utf8((unsigned char*) buffer, length, &msg, &faulty_bytes);

        if (msg == nullptr) {
            break;
        }
        for (int lpc = 0; lpc < faulty_bytes; lpc++) {
-            buffer[utf8_end + lpc] = '?';
+            buffer[scan_res.usr_end + lpc] = '?';
        }
    }
 }
--- a/src/config.cmake.h.in
+++ b/src/config.cmake.h.in
@ -6,7 +6,7 @@

 #define HAVE_LIBCURL

-#cmakedefine SIZEOF_OFF_T @SIZEOF_OFF_T @
+#cmakedefine SIZEOF_OFF_T @SIZEOF_OFF_T@

 #cmakedefine VCS_PACKAGE_STRING "@VCS_PACKAGE_STRING@"

--- a/src/data_parser.cc
+++ b/src/data_parser.cc
@ -90,7 +90,7 @@ data_parser::pairup(data_parser::schema_id_t* schema,
        } else if (iter->e_token == in_list.el_format.df_qualifier) {
            value.SPLICE(
                value.end(), key_comps, key_comps.begin(), key_comps.end());
-            strip(value, element_if(DT_WHITE));
+            strip(value, element_is_space{});
            if (!value.empty()) {
                el_stack.PUSH_BACK(element(value, DNT_VALUE));
            }
@ -119,7 +119,7 @@ data_parser::pairup(data_parser::schema_id_t* schema,
                                     key_comps.begin(),
                                     key_iter);
                        key_comps.POP_FRONT();
-                        strip(key_comps, element_if(DT_WHITE));
+                        strip(key_comps, element_is_space{});
                        if (key_comps.empty()) {
                            key_iter = key_comps.end();
                        } else {
@ -160,12 +160,12 @@ data_parser::pairup(data_parser::schema_id_t* schema,
                key_comps.resize(1);
            }

-            strip(value, element_if(DT_WHITE));
+            strip(value, element_is_space{});
            value.remove_if(element_if(DT_COMMA));
            if (!value.empty()) {
                el_stack.PUSH_BACK(element(value, DNT_VALUE));
            }
-            strip(key_comps, element_if(DT_WHITE));
+            strip(key_comps, element_is_space{});
            if (!key_comps.empty()) {
                if (key_is_values) {
                    el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
@ -531,8 +531,8 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
    key_comps.remove_if(element_if(DT_COMMA));
    value.remove_if(element_if(in_list.el_format.df_terminator));
    value.remove_if(element_if(DT_COMMA));
-    strip(key_comps, element_if(DT_WHITE));
-    strip(value, element_if(DT_WHITE));
+    strip(key_comps, element_is_space{});
+    strip(value, element_is_space{});
    if ((el_stack.empty() || el_stack.back().e_token != DNT_KEY)
        && value.empty() && key_comps.size() > 1
        && (key_comps.front().e_token == DT_WORD
@ -550,7 +550,9 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
                if (found_value) {
                    key_end = key_comps.begin();
                }
-            } else if (key_iter->e_token == DT_WHITE) {
+            } else if (key_iter->e_token == DT_WHITE
+                       || key_iter->e_token == DT_CSI)
+            {
            } else {
                if (!found_value) {
                    key_end = key_iter;
@ -562,7 +564,7 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
            key_end = key_comps.begin();
        }
        value.SPLICE(value.end(), key_comps, key_end, key_comps.end());
-        strip(key_comps, element_if(DT_WHITE));
+        strip(key_comps, element_is_space{});
        if (!key_comps.empty()) {
            el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
        }
@ -571,9 +573,9 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
        value.SPLICE(
            value.end(), key_comps, key_comps.begin(), key_comps.end());
    }
-    strip(value, element_if(DT_WHITE));
+    strip(value, element_is_space{});
    strip(value, element_if(DT_COLON));
-    strip(value, element_if(DT_WHITE));
+    strip(value, element_is_space{});
    if (!value.empty()) {
        if (value.size() == 2 && value.back().e_token == DNT_GROUP) {
            element_list_t ELEMENT_LIST_T(group_pair);
@ -681,6 +683,7 @@ dfs_prefix_next(data_format_state_t state, data_token_t next_token)
                case DT_HEX_NUMBER:
                case DT_NUMBER:
                case DT_WHITE:
+                case DT_CSI:
                case DT_LSQUARE:
                case DT_RSQUARE:
                case DT_LANGLE:
--- a/src/data_parser.hh
+++ b/src/data_parser.hh
@ -77,7 +77,9 @@ struct data_format {
                data_token_t terminator = DT_INVALID) noexcept
        : df_name(name), df_appender(appender), df_terminator(terminator),
          df_qualifier(DT_INVALID), df_separator(DT_COLON),
-          df_prefix_terminator(DT_INVALID){};
+          df_prefix_terminator(DT_INVALID)
+    {
+    }

    const char* df_name;
    data_token_t df_appender;
@ -234,7 +236,7 @@ public:
            int group_depth = -1;

            LIST_INIT_TRACE;
-        };
+        }

        element_list_t(const element_list_t& other) : std::list<element>(other)
        {
@ -247,7 +249,7 @@ public:
            int line = __LINE__;

            LIST_DEINIT_TRACE;
-        };
+        }

        void push_front(const element& elem, const char* fn, int line)
        {
@ -255,7 +257,7 @@ public:

            require(elem.e_capture.c_end >= -1);
            this->std::list<element>::push_front(elem);
-        };
+        }

        void push_back(const element& elem, const char* fn, int line)
        {
@ -263,28 +265,28 @@ public:

            require(elem.e_capture.c_end >= -1);
            this->std::list<element>::push_back(elem);
-        };
+        }

        void pop_front(const char* fn, int line)
        {
            LIST_TRACE;

            this->std::list<element>::pop_front();
-        };
+        }

        void pop_back(const char* fn, int line)
        {
            LIST_TRACE;

            this->std::list<element>::pop_back();
-        };
+        }

        void clear2(const char* fn, int line)
        {
            LIST_TRACE;

            this->std::list<element>::clear();
-        };
+        }

        void swap(element_list_t& other, const char* fn, int line)
        {
@ -345,26 +347,33 @@ public:
        bool operator()(data_token_t token, const element& elem) const
        {
            return token == elem.e_token || token == DT_ANY;
-        };
+        }

        bool operator()(const element& elem, data_token_t token) const
        {
            return (*this)(token, elem);
-        };
+        }
    };

    struct element_if {
-        element_if(data_token_t token) : ei_token(token){};
+        element_if(data_token_t token) : ei_token(token) {}

        bool operator()(const element& a) const
        {
            return a.e_token == this->ei_token;
-        };
+        }

    private:
        data_token_t ei_token;
    };

+    struct element_is_space {
+        bool operator()(const element& el) const
+        {
+            return el.e_token == DT_WHITE || el.e_token == DT_CSI;
+        }
+    };
+
    struct discover_format_state {
        discover_format_state();

@ -418,4 +427,5 @@ public:
 private:
    data_scanner* dp_scanner;
 };
+
 #endif
--- a/src/data_scanner.cc
+++ b/src/data_scanner.cc
@ -185,6 +185,9 @@ static struct {
    {
        "escc",
    },
+    {
+        "csi",
+    },

    {
        "gbg",
--- a/src/data_scanner.hh
+++ b/src/data_scanner.hh
@ -93,6 +93,7 @@ enum data_token_t {
    DT_WHITE,
    DT_DOT,
    DT_ESCAPED_CHAR,
+    DT_CSI,

    DT_GARBAGE,

--- a/src/data_scanner_re.cc
+++ b/src/data_scanner_re.cc
--- a/src/data_scanner_re.re
+++ b/src/data_scanner_re.re
@ -114,6 +114,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()

       SPACE = [ \t\r];
       ALPHA = [a-zA-Z];
+       ESC = "\x1b";
       NUM = [0-9];
       ALPHANUM = [a-zA-Z0-9_];
       EOF = "\x00";
@ -137,7 +138,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()

       EOF { return nonstd::nullopt; }

-       ("u"|"r")?'"'('\\'.|[^\x00"\\]|'""')*'"' {
+       ("u"|"r")?'"'('\\'.|[^\x00\x1b"\\]|'""')*'"' {
           CAPTURE(DT_QUOTED_STRING);
           switch (this->ds_input[cap_inner.c_begin]) {
           case 'u':
@ -152,7 +153,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
       [a-qstv-zA-QSTV-Z]"'" {
           CAPTURE(DT_WORD);
       }
-       ("u"|"r")?"'"('\\'.|"''"|[^\x00'\\])*"'"/[^sS] {
+       ("u"|"r")?"'"('\\'.|"''"|[^\x00\x1b'\\])*"'"/[^sS] {
           CAPTURE(DT_QUOTED_STRING);
           switch (this->ds_input[cap_inner.c_begin]) {
           case 'u':
@ -164,7 +165,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
           cap_inner.c_end -= 1;
           return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()};
       }
-       [a-zA-Z0-9]+":/""/"?[^\x00\r\n\t '"[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
+       [a-zA-Z0-9]+":/""/"?[^\x00\x1b\r\n\t '"[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
       ("/"|"./"|"../"|[A-Z]":\\"|"\\\\")("Program Files"(" (x86)")?)?[a-zA-Z0-9_\.\-\~/\\!@#$%^&*()]* { RET(DT_PATH); }
       (SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); }
       (SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); }
@ -200,6 +201,10 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
           RET(DT_H1);
       }

+       ESC"["[0-9=;?]*[a-zA-Z] {
+           RET(DT_CSI);
+       }
+
       ":" { RET(DT_COLON); }
       "=" { RET(DT_EQUALS); }
       "," { RET(DT_COMMA); }
@ -237,7 +242,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()

       ("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); }

-       [^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\.\\][^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]*("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
+       [^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\.\\][^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]*("::"[^\x00\x1b"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
           RET(DT_SYMBOL);
       }

--- a/src/line_buffer.cc
+++ b/src/line_buffer.cc
@ -651,21 +651,22 @@ line_buffer::load_next_buffer()

            auto before = line_start - this->lb_alt_buffer->begin();
            auto remaining = this->lb_alt_buffer.value().size() - before;
-            auto utf8_end = is_utf8((unsigned char*) line_start,
+            auto utf_scan_res = is_utf8((unsigned char*) line_start,
                                        remaining,
                                        &msg,
                                        &faulty_bytes,
                                        '\n');
            if (msg != nullptr) {
                lf = (char*) memchr(line_start, '\n', remaining);
-                utf8_end = lf - line_start;
+                utf_scan_res.usr_end = lf - line_start;
                valid_utf = false;
            }
-            if (utf8_end >= 0) {
-                lf = line_start + utf8_end;
+            if (utf_scan_res.usr_end >= 0) {
+                lf = line_start + utf_scan_res.usr_end;
            }
            this->lb_alt_line_starts.emplace_back(before);
            this->lb_alt_line_is_utf.emplace_back(valid_utf);
+            this->lb_alt_line_has_ansi.emplace_back(utf_scan_res.usr_has_ansi);

            if (lf != nullptr) {
                line_start = lf + 1;
@ -727,6 +728,8 @@ line_buffer::fill_range(file_off_t start, ssize_t max_length)
        this->lb_alt_line_starts.clear();
        this->lb_line_is_utf = std::move(this->lb_alt_line_is_utf);
        this->lb_alt_line_is_utf.clear();
+        this->lb_line_has_ansi = std::move(this->lb_alt_line_has_ansi);
+        this->lb_alt_line_has_ansi.clear();
        this->lb_stats.s_used_preloads += 1;
    }
    if (this->in_range(start) && this->in_range(start + max_length - 1)) {
@ -1045,7 +1048,7 @@ line_buffer::load_next_line(file_range prev_line)
            const char* msg;
            int faulty_bytes;

-            utf8_end = is_utf8((unsigned char*) line_start,
+            auto scan_res = is_utf8((unsigned char*) line_start,
                                    retval.li_file_range.fr_size,
                                    &msg,
                                    &faulty_bytes,
@ -1055,7 +1058,10 @@ line_buffer::load_next_line(file_range prev_line)
                    line_start, '\n', retval.li_file_range.fr_size);
                utf8_end = lf - line_start;
                retval.li_valid_utf = false;
+            } else {
+                utf8_end = scan_res.usr_end;
            }
+            retval.li_has_ansi = scan_res.usr_has_ansi;
        }

        if (utf8_end >= 0) {
--- a/src/line_buffer.hh
+++ b/src/line_buffer.hh
@ -54,6 +54,7 @@ struct line_info {
    file_range li_file_range;
    bool li_partial{false};
    bool li_valid_utf{true};
+    bool li_has_ansi{false};
 };

 /**
@ -320,6 +321,7 @@ private:
    nonstd::optional<auto_buffer> lb_alt_buffer;
    std::vector<uint32_t> lb_alt_line_starts;
    std::vector<bool> lb_alt_line_is_utf;
+    std::vector<bool> lb_alt_line_has_ansi;
    std::future<bool> lb_loader_future;
    nonstd::optional<file_off_t> lb_loader_file_offset;

@ -342,6 +344,7 @@ private:

    std::vector<uint32_t> lb_line_starts;
    std::vector<bool> lb_line_is_utf;
+    std::vector<bool> lb_line_has_ansi;
    stats lb_stats;

    nonstd::optional<auto_fd> lb_cached_fd;
--- a/src/log_data_helper.cc
+++ b/src/log_data_helper.cc
@ -129,7 +129,7 @@ log_data_helper::parse_line(content_line_t line, bool allow_middle)
                        pugi::xpath_query query("//*");
                        auto node_set = doc.select_nodes(query);

-                        for (auto& xpath_node : node_set) {
+                        for (const auto& xpath_node : node_set) {
                            auto node_path = lnav::pugixml::get_actual_path(
                                xpath_node.node());
                            for (auto& attr : xpath_node.node().attributes()) {
@ -175,10 +175,8 @@ log_data_helper::get_line_bounds(size_t& line_index_out,

    line_end_index_out = 0;
    do {
-        const char* line_end;
-
        line_index_out = line_end_index_out;
-        line_end = (const char*) memchr(
+        const auto* line_end = (const char*) memchr(
            this->ldh_line_values.lvv_sbr.get_data() + line_index_out + 1,
            '\n',
            this->ldh_line_values.lvv_sbr.length() - line_index_out - 1);
--- a/src/log_format_fwd.hh
+++ b/src/log_format_fwd.hh
@ -194,6 +194,10 @@ public:

    bool is_valid_utf() const { return this->ll_valid_utf; }

+    void set_has_ansi(bool v) { this->ll_has_ansi = v; }
+
+    bool has_ansi() const { return this->ll_has_ansi; }
+
    /** @param l The logging level. */
    void set_level(log_level_t l) { this->ll_level = l; };

@ -293,7 +297,8 @@ public:
    }

 private:
-    file_off_t ll_offset;
+    file_off_t ll_offset : 63;
+    uint8_t ll_has_ansi : 1;
    time_t ll_time;
    unsigned int ll_millis : 10;
    unsigned int ll_opid : 6;
--- a/src/logfile.cc
+++ b/src/logfile.cc
@ -42,6 +42,7 @@
 #include <sys/stat.h>
 #include <time.h>

+#include "base/ansi_scrubber.hh"
 #include "base/fs_util.hh"
 #include "base/injector.hh"
 #include "base/string_util.hh"
@ -309,6 +310,7 @@ logfile::process_prefix(shared_buffer_ref& sbr,
        case log_format::SCAN_MATCH: {
            if (!this->lf_index.empty()) {
                this->lf_index.back().set_valid_utf(li.li_valid_utf);
+                this->lf_index.back().set_has_ansi(li.li_has_ansi);
            }
            if (prescan_size > 0 && this->lf_index.size() >= prescan_size
                && prescan_time != this->lf_index[prescan_size - 1].get_time())
@ -369,6 +371,7 @@ logfile::process_prefix(shared_buffer_ref& sbr,
                                        last_mod,
                                        last_opid);
            this->lf_index.back().set_valid_utf(li.li_valid_utf);
+            this->lf_index.back().set_has_ansi(li.li_has_ansi);
            break;
        }
        case log_format::SCAN_INCOMPLETE:
@ -582,6 +585,17 @@ logfile::rebuild_index(nonstd::optional<ui_clock::time_point> deadline)

            auto sbr = read_result.unwrap();
            sbr.rtrim(is_line_ending);
+
+            if (li.li_has_ansi) {
+                auto tmp_line = sbr.to_string_fragment().to_string();
+
+                scrub_ansi_string(tmp_line, nullptr);
+                memcpy(sbr.get_writable_data(),
+                       tmp_line.c_str(),
+                       tmp_line.length());
+                sbr.narrow(0, tmp_line.length());
+            }
+
            this->lf_longest_line
                = std::max(this->lf_longest_line, sbr.length());
            this->lf_partial_line = li.li_partial;
--- a/src/logfile_sub_source.cc
+++ b/src/logfile_sub_source.cc
@ -197,6 +197,9 @@ logfile_sub_source::text_value_for_line(textview_curses& tc,
            = this->lss_token_file->read_line(this->lss_token_line)
                  .map([](auto sbr) { return to_string(sbr); })
                  .unwrapOr({});
+        if (this->lss_token_line->has_ansi()) {
+            scrub_ansi_string(this->lss_token_value, &this->lss_token_attrs);
+        }
    }
    this->lss_token_shift_start = 0;
    this->lss_token_shift_size = 0;
--- a/src/md4cpp.cc
+++ b/src/md4cpp.cc
@ -267,14 +267,14 @@ parse(const string_fragment& sf, event_handler& eh)
    const char* utf8_errmsg = nullptr;
    int utf8_faulty_bytes = 0;

-    auto utf8_erroff = is_utf8((unsigned char*) sf.data(),
+    auto scan_res = is_utf8((unsigned char*) sf.data(),
                            sf.length(),
                            &utf8_errmsg,
                            &utf8_faulty_bytes);
    if (utf8_errmsg != nullptr) {
        return Err(
            fmt::format(FMT_STRING("file has invalid UTF-8 at offset {}: {}"),
-                        utf8_erroff,
+                        scan_res.usr_end,
                        utf8_errmsg));
    }

--- a/test/lnav_doctests.cc
+++ b/test/lnav_doctests.cc
@ -31,6 +31,7 @@

 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
 #include "byte_array.hh"
+#include "data_scanner.hh"
 #include "doctest/doctest.h"
 #include "lnav_config.hh"
 #include "lnav_util.hh"
@ -148,10 +149,7 @@ class my_path_source : public unique_path_source {
 public:
    explicit my_path_source(ghc::filesystem::path p) : mps_path(std::move(p)) {}

-    ghc::filesystem::path get_path() const override
-    {
-        return this->mps_path;
-    }
+    ghc::filesystem::path get_path() const override { return this->mps_path; }

    ghc::filesystem::path mps_path;
 };
@ -214,3 +212,20 @@ TEST_CASE("user_message to json")

    CHECK(json == json2);
 }
+
+TEST_CASE("data_scanner CSI")
+{
+    static const char INPUT[] = "\x1b[32mHello\x1b[0m";
+
+    data_scanner ds(string_fragment::from_const(INPUT));
+
+    auto tok_res = ds.tokenize2();
+    CHECK(tok_res->tr_token == DT_CSI);
+    CHECK(tok_res->to_string() == "\x1b[32m");
+    tok_res = ds.tokenize2();
+    CHECK(tok_res->tr_token == DT_SYMBOL);
+    CHECK(tok_res->to_string() == "Hello");
+    tok_res = ds.tokenize2();
+    CHECK(tok_res->tr_token == DT_CSI);
+    CHECK(tok_res->to_string() == "\x1b[0m");
+}