[logfile] handle logs with ANSI escapes

This is a partial fix for handling ANSI escape sequences in
log messages, which would otherwise prevent regexes
from matching. Still more work to do.

Related to #1057
Tim Stack 2022-09-14 21:43:36 -07:00
parent 8613ad4d47
commit 740b827901
18 changed files with 21358 additions and 20328 deletions
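For context, here is a minimal standalone illustration (not lnav code) of the problem the commit message describes: a color escape at the start of a line keeps an anchored log-level regex from matching, while the same text without the escape matches fine.

#include <cstdio>
#include <regex>
#include <string>

int main()
{
    // The same log line with and without terminal color codes.
    const std::string colored = "\x1b[31mERROR\x1b[0m: connection refused";
    const std::string plain = "ERROR: connection refused";
    const std::regex level_re("^(ERROR|WARNING|INFO):");

    // The leading CSI sequence defeats the anchored match (prints 0)...
    printf("colored matches: %d\n", std::regex_search(colored, level_re));
    // ...while the plain text matches as expected (prints 1).
    printf("plain matches:   %d\n", std::regex_search(plain, level_re));
    return 0;
}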

View File

@ -59,21 +59,26 @@
`faulty_bytes` the number of bytes actually present that take part in this
error.
*/
ssize_t
utf8_scan_result
is_utf8(const unsigned char* str,
size_t len,
const char** message,
int* faulty_bytes,
nonstd::optional<unsigned char> terminator)
{
size_t i = 0;
bool has_ansi = false;
ssize_t i = 0;
*message = nullptr;
*faulty_bytes = 0;
while (i < len) {
if (str[i] == '\x1b') {
has_ansi = true;
}
if (terminator && str[i] == terminator.value()) {
*message = nullptr;
return i;
return {i, has_ansi};
}
if (str[i] <= 0x7F) /* 00..7F */ {
@ -85,14 +90,14 @@ is_utf8(const unsigned char* str,
= "After a first byte between C2 and DF, expecting a "
"2nd byte between 80 and BF";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte between C2 and DF, expecting a 2nd "
"byte.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 2;
} else if (str[i] == 0xE0) /* E0 A0..BF 80..BF */ {
@ -102,21 +107,21 @@ is_utf8(const unsigned char* str,
= "After a first byte of E0, expecting a 2nd byte "
"between A0 and BF.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte of E0, expecting a 3rd byte "
"between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte of E0, expecting two following "
"bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 3;
} else if (str[i] >= 0xE1 && str[i] <= 0xEC) /* E1..EC 80..BF 80..BF */
@ -127,21 +132,21 @@ is_utf8(const unsigned char* str,
= "After a first byte between E1 and EC, expecting the "
"2nd byte between 80 and BF.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte between E1 and EC, expecting the "
"3rd byte between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte between E1 and EC, expecting two "
"following bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 3;
} else if (str[i] == 0xED) /* ED 80..9F 80..BF */ {
@ -151,21 +156,21 @@ is_utf8(const unsigned char* str,
= "After a first byte of ED, expecting 2nd byte "
"between 80 and 9F.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte of ED, expecting 3rd byte "
"between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte of ED, expecting two following "
"bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 3;
} else if (str[i] >= 0xEE && str[i] <= 0xEF) /* EE..EF 80..BF 80..BF */
@ -176,21 +181,21 @@ is_utf8(const unsigned char* str,
= "After a first byte between EE and EF, expecting 2nd "
"byte between 80 and BF.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte between EE and EF, expecting 3rd "
"byte between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte between EE and EF, expecting two "
"following bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 3;
} else if (str[i] == 0xF0) /* F0 90..BF 80..BF 80..BF */ {
@ -200,60 +205,61 @@ is_utf8(const unsigned char* str,
= "After a first byte of F0, expecting 2nd byte "
"between 90 and BF.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte of F0, expecting 3rd byte "
"between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
*message
= "After a first byte of F0, expecting 4th byte "
"between 80 and BF.";
*faulty_bytes = 4;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte of F0, expecting three following "
"bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 4;
} else if (str[i] >= 0xF1
&& str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */ {
&& str[i] <= 0xF3) /* F1..F3 80..BF 80..BF 80..BF */
{
if (i + 3 < len) /* Expect a 2nd, 3rd, and 4th byte */ {
if (str[i + 1] < 0x80 || str[i + 1] > 0xBF) {
*message
= "After a first byte of F1, F2, or F3, expecting a "
"2nd byte between 80 and BF.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte of F1, F2, or F3, expecting a "
"3rd byte between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
*message
= "After a first byte of F1, F2, or F3, expecting a "
"4th byte between 80 and BF.";
*faulty_bytes = 4;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte of F1, F2, or F3, expecting three "
"following bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 4;
} else if (str[i] == 0xF4) /* F4 80..8F 80..BF 80..BF */ {
@ -263,36 +269,36 @@ is_utf8(const unsigned char* str,
= "After a first byte of F4, expecting 2nd byte "
"between 80 and 8F.";
*faulty_bytes = 2;
return i;
return {i, has_ansi};
}
if (str[i + 2] < 0x80 || str[i + 2] > 0xBF) {
*message
= "After a first byte of F4, expecting 3rd byte "
"between 80 and BF.";
*faulty_bytes = 3;
return i;
return {i, has_ansi};
}
if (str[i + 3] < 0x80 || str[i + 3] > 0xBF) {
*message
= "After a first byte of F4, expecting 4th byte "
"between 80 and BF.";
*faulty_bytes = 4;
return i;
return {i, has_ansi};
}
} else {
*message
= "After a first byte of F4, expecting three following "
"bytes.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
i += 4;
} else {
*message
= "Expecting bytes in the following ranges: 00..7F C2..F4.";
*faulty_bytes = 1;
return i;
return {i, has_ansi};
}
}
return -1;
return {-1, has_ansi};
}

View File

@ -33,10 +33,16 @@
#include "optional.hpp"
ssize_t is_utf8(const unsigned char* str,
size_t len,
const char** message,
int* faulty_bytes,
nonstd::optional<unsigned char> terminator = nonstd::nullopt);
struct utf8_scan_result {
ssize_t usr_end{0};
bool usr_has_ansi{false};
};
utf8_scan_result is_utf8(const unsigned char* str,
size_t len,
const char** message,
int* faulty_bytes,
nonstd::optional<unsigned char> terminator
= nonstd::nullopt);
#endif /* _IS_UTF8_H */
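A minimal sketch of how a caller might use the new return type. The field names come from the struct above; the wrapper function, header path, and variable names are illustrative only.

#include <string>
#include "is_utf8.hh" // header shown above; actual path may differ

// Illustrative only: scan one line and inspect the new result struct.
static void scan_line(const std::string& line)
{
    const char* msg = nullptr;
    int faulty_bytes = 0;
    auto res = is_utf8(reinterpret_cast<const unsigned char*>(line.data()),
                       line.size(),
                       &msg,
                       &faulty_bytes,
                       '\n');
    if (msg != nullptr) {
        // Invalid UTF-8: the offending sequence starts at res.usr_end and
        // spans faulty_bytes bytes.
    } else if (res.usr_end >= 0) {
        // Stopped at the '\n' terminator located at offset res.usr_end.
    } else {
        // usr_end is -1: the whole buffer is valid UTF-8 with no terminator.
    }
    if (res.usr_has_ansi) {
        // The scanned span contained at least one ESC (0x1b) byte.
    }
}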

View File

@ -45,14 +45,14 @@ scrub_to_utf8(char* buffer, size_t length)
int faulty_bytes;
while (true) {
ssize_t utf8_end
auto scan_res
= is_utf8((unsigned char*) buffer, length, &msg, &faulty_bytes);
if (msg == nullptr) {
break;
}
for (int lpc = 0; lpc < faulty_bytes; lpc++) {
buffer[utf8_end + lpc] = '?';
buffer[scan_res.usr_end + lpc] = '?';
}
}
}

View File

@ -6,7 +6,7 @@
#define HAVE_LIBCURL
#cmakedefine SIZEOF_OFF_T @SIZEOF_OFF_T @
#cmakedefine SIZEOF_OFF_T @SIZEOF_OFF_T@
#cmakedefine VCS_PACKAGE_STRING "@VCS_PACKAGE_STRING@"

View File

@ -90,7 +90,7 @@ data_parser::pairup(data_parser::schema_id_t* schema,
} else if (iter->e_token == in_list.el_format.df_qualifier) {
value.SPLICE(
value.end(), key_comps, key_comps.begin(), key_comps.end());
strip(value, element_if(DT_WHITE));
strip(value, element_is_space{});
if (!value.empty()) {
el_stack.PUSH_BACK(element(value, DNT_VALUE));
}
@ -119,7 +119,7 @@ data_parser::pairup(data_parser::schema_id_t* schema,
key_comps.begin(),
key_iter);
key_comps.POP_FRONT();
strip(key_comps, element_if(DT_WHITE));
strip(key_comps, element_is_space{});
if (key_comps.empty()) {
key_iter = key_comps.end();
} else {
@ -160,12 +160,12 @@ data_parser::pairup(data_parser::schema_id_t* schema,
key_comps.resize(1);
}
strip(value, element_if(DT_WHITE));
strip(value, element_is_space{});
value.remove_if(element_if(DT_COMMA));
if (!value.empty()) {
el_stack.PUSH_BACK(element(value, DNT_VALUE));
}
strip(key_comps, element_if(DT_WHITE));
strip(key_comps, element_is_space{});
if (!key_comps.empty()) {
if (key_is_values) {
el_stack.PUSH_BACK(element(key_comps, DNT_VALUE));
@ -531,8 +531,8 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
key_comps.remove_if(element_if(DT_COMMA));
value.remove_if(element_if(in_list.el_format.df_terminator));
value.remove_if(element_if(DT_COMMA));
strip(key_comps, element_if(DT_WHITE));
strip(value, element_if(DT_WHITE));
strip(key_comps, element_is_space{});
strip(value, element_is_space{});
if ((el_stack.empty() || el_stack.back().e_token != DNT_KEY)
&& value.empty() && key_comps.size() > 1
&& (key_comps.front().e_token == DT_WORD
@ -550,7 +550,9 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
if (found_value) {
key_end = key_comps.begin();
}
} else if (key_iter->e_token == DT_WHITE) {
} else if (key_iter->e_token == DT_WHITE
|| key_iter->e_token == DT_CSI)
{
} else {
if (!found_value) {
key_end = key_iter;
@ -562,7 +564,7 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
key_end = key_comps.begin();
}
value.SPLICE(value.end(), key_comps, key_end, key_comps.end());
strip(key_comps, element_if(DT_WHITE));
strip(key_comps, element_is_space{});
if (!key_comps.empty()) {
el_stack.PUSH_BACK(element(key_comps, DNT_KEY, false));
}
@ -571,9 +573,9 @@ data_parser::end_of_value(data_parser::element_list_t& el_stack,
value.SPLICE(
value.end(), key_comps, key_comps.begin(), key_comps.end());
}
strip(value, element_if(DT_WHITE));
strip(value, element_is_space{});
strip(value, element_if(DT_COLON));
strip(value, element_if(DT_WHITE));
strip(value, element_is_space{});
if (!value.empty()) {
if (value.size() == 2 && value.back().e_token == DNT_GROUP) {
element_list_t ELEMENT_LIST_T(group_pair);
@ -681,6 +683,7 @@ dfs_prefix_next(data_format_state_t state, data_token_t next_token)
case DT_HEX_NUMBER:
case DT_NUMBER:
case DT_WHITE:
case DT_CSI:
case DT_LSQUARE:
case DT_RSQUARE:
case DT_LANGLE:

View File

@ -77,7 +77,9 @@ struct data_format {
data_token_t terminator = DT_INVALID) noexcept
: df_name(name), df_appender(appender), df_terminator(terminator),
df_qualifier(DT_INVALID), df_separator(DT_COLON),
df_prefix_terminator(DT_INVALID){};
df_prefix_terminator(DT_INVALID)
{
}
const char* df_name;
data_token_t df_appender;
@ -234,7 +236,7 @@ public:
int group_depth = -1;
LIST_INIT_TRACE;
};
}
element_list_t(const element_list_t& other) : std::list<element>(other)
{
@ -247,7 +249,7 @@ public:
int line = __LINE__;
LIST_DEINIT_TRACE;
};
}
void push_front(const element& elem, const char* fn, int line)
{
@ -255,7 +257,7 @@ public:
require(elem.e_capture.c_end >= -1);
this->std::list<element>::push_front(elem);
};
}
void push_back(const element& elem, const char* fn, int line)
{
@ -263,28 +265,28 @@ public:
require(elem.e_capture.c_end >= -1);
this->std::list<element>::push_back(elem);
};
}
void pop_front(const char* fn, int line)
{
LIST_TRACE;
this->std::list<element>::pop_front();
};
}
void pop_back(const char* fn, int line)
{
LIST_TRACE;
this->std::list<element>::pop_back();
};
}
void clear2(const char* fn, int line)
{
LIST_TRACE;
this->std::list<element>::clear();
};
}
void swap(element_list_t& other, const char* fn, int line)
{
@ -345,26 +347,33 @@ public:
bool operator()(data_token_t token, const element& elem) const
{
return token == elem.e_token || token == DT_ANY;
};
}
bool operator()(const element& elem, data_token_t token) const
{
return (*this)(token, elem);
};
}
};
struct element_if {
element_if(data_token_t token) : ei_token(token){};
element_if(data_token_t token) : ei_token(token) {}
bool operator()(const element& a) const
{
return a.e_token == this->ei_token;
};
}
private:
data_token_t ei_token;
};
struct element_is_space {
bool operator()(const element& el) const
{
return el.e_token == DT_WHITE || el.e_token == DT_CSI;
}
};
struct discover_format_state {
discover_format_state();
@ -418,4 +427,5 @@ public:
private:
data_scanner* dp_scanner;
};
#endif

View File

@ -185,6 +185,9 @@ static struct {
{
"escc",
},
{
"csi",
},
{
"gbg",

View File

@ -93,6 +93,7 @@ enum data_token_t {
DT_WHITE,
DT_DOT,
DT_ESCAPED_CHAR,
DT_CSI,
DT_GARBAGE,

File diff suppressed because it is too large

View File

@ -114,6 +114,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
SPACE = [ \t\r];
ALPHA = [a-zA-Z];
ESC = "\x1b";
NUM = [0-9];
ALPHANUM = [a-zA-Z0-9_];
EOF = "\x00";
@ -137,7 +138,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
EOF { return nonstd::nullopt; }
("u"|"r")?'"'('\\'.|[^\x00"\\]|'""')*'"' {
("u"|"r")?'"'('\\'.|[^\x00\x1b"\\]|'""')*'"' {
CAPTURE(DT_QUOTED_STRING);
switch (this->ds_input[cap_inner.c_begin]) {
case 'u':
@ -152,7 +153,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
[a-qstv-zA-QSTV-Z]"'" {
CAPTURE(DT_WORD);
}
("u"|"r")?"'"('\\'.|"''"|[^\x00'\\])*"'"/[^sS] {
("u"|"r")?"'"('\\'.|"''"|[^\x00\x1b'\\])*"'"/[^sS] {
CAPTURE(DT_QUOTED_STRING);
switch (this->ds_input[cap_inner.c_begin]) {
case 'u':
@ -164,7 +165,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
cap_inner.c_end -= 1;
return tokenize_result{token_out, cap_all, cap_inner, this->ds_input.data()};
}
[a-zA-Z0-9]+":/""/"?[^\x00\r\n\t '"[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
[a-zA-Z0-9]+":/""/"?[^\x00\x1b\r\n\t '"[\](){}]+[/a-zA-Z0-9\-=&?%] { RET(DT_URL); }
("/"|"./"|"../"|[A-Z]":\\"|"\\\\")("Program Files"(" (x86)")?)?[a-zA-Z0-9_\.\-\~/\\!@#$%^&*()]* { RET(DT_PATH); }
(SPACE|NUM)NUM":"NUM{2}/[^:] { RET(DT_TIME); }
(SPACE|NUM)NUM?":"NUM{2}":"NUM{2}("."NUM{3,6})?/[^:] { RET(DT_TIME); }
@ -200,6 +201,10 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
RET(DT_H1);
}
ESC"["[0-9=;?]*[a-zA-Z] {
RET(DT_CSI);
}
":" { RET(DT_COLON); }
"=" { RET(DT_EQUALS); }
"," { RET(DT_COMMA); }
@ -237,7 +242,7 @@ nonstd::optional<data_scanner::tokenize_result> data_scanner::tokenize2()
("re-")?[a-zA-Z][a-z']+/([\r\n\t \(\)!\*:;'\"\?,]|[\.\!,\?]SPACE|EOF) { RET(DT_WORD); }
[^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\.\\][^\x00"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]*("::"[^\x00"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
[^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\.\\][^\x00\x1b"; \t\r\n:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]*("::"[^\x00\x1b"; \r\n\t:=,\(\)\{\}\[\]\+#!%\^&\*'\?<>\~`\|\\]+)* {
RET(DT_SYMBOL);
}
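The DT_CSI rule added above only recognizes CSI-style sequences (ESC, '[', parameter bytes, one final alphabetic byte). The sketch below mirrors that shape with a std::regex so it is easy to see which sequences it covers; it is illustrative only and not part of the change. Non-CSI escapes, such as OSC window-title sequences, are not matched by this rule.

#include <cstdio>
#include <regex>

int main()
{
    // Same shape as the re2c rule: ESC "[" [0-9=;?]* [a-zA-Z]
    static const std::regex csi("\x1b\\[[0-9=;?]*[a-zA-Z]");

    const char* samples[] = {
        "\x1b[32m",         // SGR color on  -> matches
        "\x1b[0m",          // SGR reset     -> matches
        "\x1b[?25h",        // show cursor   -> matches
        "\x1b]0;title\x07", // OSC title     -> does not match (not CSI)
    };
    for (const auto* s : samples) {
        printf("matches: %d\n", std::regex_search(s, csi));
    }
    return 0;
}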

View File

@ -651,21 +651,22 @@ line_buffer::load_next_buffer()
auto before = line_start - this->lb_alt_buffer->begin();
auto remaining = this->lb_alt_buffer.value().size() - before;
auto utf8_end = is_utf8((unsigned char*) line_start,
remaining,
&msg,
&faulty_bytes,
'\n');
auto utf_scan_res = is_utf8((unsigned char*) line_start,
remaining,
&msg,
&faulty_bytes,
'\n');
if (msg != nullptr) {
lf = (char*) memchr(line_start, '\n', remaining);
utf8_end = lf - line_start;
utf_scan_res.usr_end = lf - line_start;
valid_utf = false;
}
if (utf8_end >= 0) {
lf = line_start + utf8_end;
if (utf_scan_res.usr_end >= 0) {
lf = line_start + utf_scan_res.usr_end;
}
this->lb_alt_line_starts.emplace_back(before);
this->lb_alt_line_is_utf.emplace_back(valid_utf);
this->lb_alt_line_has_ansi.emplace_back(utf_scan_res.usr_has_ansi);
if (lf != nullptr) {
line_start = lf + 1;
@ -727,6 +728,8 @@ line_buffer::fill_range(file_off_t start, ssize_t max_length)
this->lb_alt_line_starts.clear();
this->lb_line_is_utf = std::move(this->lb_alt_line_is_utf);
this->lb_alt_line_is_utf.clear();
this->lb_line_has_ansi = std::move(this->lb_alt_line_has_ansi);
this->lb_alt_line_has_ansi.clear();
this->lb_stats.s_used_preloads += 1;
}
if (this->in_range(start) && this->in_range(start + max_length - 1)) {
@ -1045,17 +1048,20 @@ line_buffer::load_next_line(file_range prev_line)
const char* msg;
int faulty_bytes;
utf8_end = is_utf8((unsigned char*) line_start,
retval.li_file_range.fr_size,
&msg,
&faulty_bytes,
'\n');
auto scan_res = is_utf8((unsigned char*) line_start,
retval.li_file_range.fr_size,
&msg,
&faulty_bytes,
'\n');
if (msg != nullptr) {
lf = (char*) memchr(
line_start, '\n', retval.li_file_range.fr_size);
utf8_end = lf - line_start;
retval.li_valid_utf = false;
} else {
utf8_end = scan_res.usr_end;
}
retval.li_has_ansi = scan_res.usr_has_ansi;
}
if (utf8_end >= 0) {

View File

@ -54,6 +54,7 @@ struct line_info {
file_range li_file_range;
bool li_partial{false};
bool li_valid_utf{true};
bool li_has_ansi{false};
};
/**
@ -320,6 +321,7 @@ private:
nonstd::optional<auto_buffer> lb_alt_buffer;
std::vector<uint32_t> lb_alt_line_starts;
std::vector<bool> lb_alt_line_is_utf;
std::vector<bool> lb_alt_line_has_ansi;
std::future<bool> lb_loader_future;
nonstd::optional<file_off_t> lb_loader_file_offset;
@ -342,6 +344,7 @@ private:
std::vector<uint32_t> lb_line_starts;
std::vector<bool> lb_line_is_utf;
std::vector<bool> lb_line_has_ansi;
stats lb_stats;
nonstd::optional<auto_fd> lb_cached_fd;

View File

@ -129,7 +129,7 @@ log_data_helper::parse_line(content_line_t line, bool allow_middle)
pugi::xpath_query query("//*");
auto node_set = doc.select_nodes(query);
for (auto& xpath_node : node_set) {
for (const auto& xpath_node : node_set) {
auto node_path = lnav::pugixml::get_actual_path(
xpath_node.node());
for (auto& attr : xpath_node.node().attributes()) {
@ -175,10 +175,8 @@ log_data_helper::get_line_bounds(size_t& line_index_out,
line_end_index_out = 0;
do {
const char* line_end;
line_index_out = line_end_index_out;
line_end = (const char*) memchr(
const auto* line_end = (const char*) memchr(
this->ldh_line_values.lvv_sbr.get_data() + line_index_out + 1,
'\n',
this->ldh_line_values.lvv_sbr.length() - line_index_out - 1);

View File

@ -194,6 +194,10 @@ public:
bool is_valid_utf() const { return this->ll_valid_utf; }
void set_has_ansi(bool v) { this->ll_has_ansi = v; }
bool has_ansi() const { return this->ll_has_ansi; }
/** @param l The logging level. */
void set_level(log_level_t l) { this->ll_level = l; };
@ -293,7 +297,8 @@ public:
}
private:
file_off_t ll_offset;
file_off_t ll_offset : 63;
uint8_t ll_has_ansi : 1;
time_t ll_time;
unsigned int ll_millis : 10;
unsigned int ll_opid : 6;

View File

@ -42,6 +42,7 @@
#include <sys/stat.h>
#include <time.h>
#include "base/ansi_scrubber.hh"
#include "base/fs_util.hh"
#include "base/injector.hh"
#include "base/string_util.hh"
@ -309,6 +310,7 @@ logfile::process_prefix(shared_buffer_ref& sbr,
case log_format::SCAN_MATCH: {
if (!this->lf_index.empty()) {
this->lf_index.back().set_valid_utf(li.li_valid_utf);
this->lf_index.back().set_has_ansi(li.li_has_ansi);
}
if (prescan_size > 0 && this->lf_index.size() >= prescan_size
&& prescan_time != this->lf_index[prescan_size - 1].get_time())
@ -369,6 +371,7 @@ logfile::process_prefix(shared_buffer_ref& sbr,
last_mod,
last_opid);
this->lf_index.back().set_valid_utf(li.li_valid_utf);
this->lf_index.back().set_has_ansi(li.li_has_ansi);
break;
}
case log_format::SCAN_INCOMPLETE:
@ -582,6 +585,17 @@ logfile::rebuild_index(nonstd::optional<ui_clock::time_point> deadline)
auto sbr = read_result.unwrap();
sbr.rtrim(is_line_ending);
if (li.li_has_ansi) {
auto tmp_line = sbr.to_string_fragment().to_string();
scrub_ansi_string(tmp_line, nullptr);
memcpy(sbr.get_writable_data(),
tmp_line.c_str(),
tmp_line.length());
sbr.narrow(0, tmp_line.length());
}
this->lf_longest_line
= std::max(this->lf_longest_line, sbr.length());
this->lf_partial_line = li.li_partial;

View File

@ -197,6 +197,9 @@ logfile_sub_source::text_value_for_line(textview_curses& tc,
= this->lss_token_file->read_line(this->lss_token_line)
.map([](auto sbr) { return to_string(sbr); })
.unwrapOr({});
if (this->lss_token_line->has_ansi()) {
scrub_ansi_string(this->lss_token_value, &this->lss_token_attrs);
}
}
this->lss_token_shift_start = 0;
this->lss_token_shift_size = 0;

View File

@ -267,14 +267,14 @@ parse(const string_fragment& sf, event_handler& eh)
const char* utf8_errmsg = nullptr;
int utf8_faulty_bytes = 0;
auto utf8_erroff = is_utf8((unsigned char*) sf.data(),
sf.length(),
&utf8_errmsg,
&utf8_faulty_bytes);
auto scan_res = is_utf8((unsigned char*) sf.data(),
sf.length(),
&utf8_errmsg,
&utf8_faulty_bytes);
if (utf8_errmsg != nullptr) {
return Err(
fmt::format(FMT_STRING("file has invalid UTF-8 at offset {}: {}"),
utf8_erroff,
scan_res.usr_end,
utf8_errmsg));
}

View File

@ -31,6 +31,7 @@
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "byte_array.hh"
#include "data_scanner.hh"
#include "doctest/doctest.h"
#include "lnav_config.hh"
#include "lnav_util.hh"
@ -148,10 +149,7 @@ class my_path_source : public unique_path_source {
public:
explicit my_path_source(ghc::filesystem::path p) : mps_path(std::move(p)) {}
ghc::filesystem::path get_path() const override
{
return this->mps_path;
}
ghc::filesystem::path get_path() const override { return this->mps_path; }
ghc::filesystem::path mps_path;
};
@ -214,3 +212,20 @@ TEST_CASE("user_message to json")
CHECK(json == json2);
}
TEST_CASE("data_scanner CSI")
{
static const char INPUT[] = "\x1b[32mHello\x1b[0m";
data_scanner ds(string_fragment::from_const(INPUT));
auto tok_res = ds.tokenize2();
CHECK(tok_res->tr_token == DT_CSI);
CHECK(tok_res->to_string() == "\x1b[32m");
tok_res = ds.tokenize2();
CHECK(tok_res->tr_token == DT_SYMBOL);
CHECK(tok_res->to_string() == "Hello");
tok_res = ds.tokenize2();
CHECK(tok_res->tr_token == DT_CSI);
CHECK(tok_res->to_string() == "\x1b[0m");
}