/** * Copyright (c) 2022, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "text_anonymizer.hh" #include #include #include "animals-json.h" #include "config.h" #include "data_scanner.hh" #include "diseases-json.h" #include "ghc/filesystem.hpp" #include "lnav_util.hh" #include "pcrepp/pcre2pp.hh" #include "words-json.h" #include "yajlpp/yajlpp_def.hh" namespace lnav { struct random_list { std::vector rl_data; std::string at_index(size_t index) const { auto counter = index / this->rl_data.size(); auto mod = index % this->rl_data.size(); auto retval = this->rl_data[mod]; if (counter > 0) { retval = fmt::format(FMT_STRING("{}{}"), retval, counter); } return retval; } }; static const typed_json_path_container random_list_handlers = { yajlpp::property_handler("data#").for_field(&random_list::rl_data), }; static random_list load_word_list() { static const intern_string_t name = intern_string::lookup(words_json.get_name()); auto parse_res = random_list_handlers.parser_for(name).with_ignore_unused(false).of( words_json.to_string_fragment()); return parse_res.unwrap(); } static const random_list& get_word_list() { static const auto retval = load_word_list(); return retval; } static random_list load_animal_list() { static const intern_string_t name = intern_string::lookup(animals_json.get_name()); auto parse_res = random_list_handlers.parser_for(name).with_ignore_unused(false).of( animals_json.to_string_fragment()); return parse_res.unwrap(); } static const random_list& get_animal_list() { static const auto retval = load_animal_list(); return retval; } static random_list load_disease_list() { static const intern_string_t name = intern_string::lookup(diseases_json.get_name()); auto parse_res = random_list_handlers.parser_for(name).with_ignore_unused(false).of( diseases_json.to_string_fragment()); return parse_res.unwrap(); } static const random_list& get_disease_list() { static const auto retval = load_disease_list(); return retval; } std::string text_anonymizer::next(string_fragment line) { data_scanner ds(line); std::string retval; while (true) { auto tok_res = ds.tokenize2(); if (!tok_res) { break; } switch (tok_res->tr_token) { case DT_URL: { auto url_str = tok_res->to_string(); auto_mem cu(curl_url_cleanup); cu = curl_url(); if (curl_url_set(cu, CURLUPART_URL, url_str.c_str(), 0) != CURLUE_OK) { retval += ""; } else { auto_mem url_part(curl_free); if (curl_url_get( cu, CURLUPART_USER, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { auto anon_user = this->get_default( this->ta_user_names, url_part.in(), [](size_t size, auto& user) { return get_animal_list().at_index(size); }); curl_url_set(cu, CURLUPART_USER, anon_user.c_str(), CURLU_URLENCODE); } if (curl_url_get(cu, CURLUPART_PASSWORD, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { auto anon_pass = hasher() .update(url_part.in(), strlen(url_part.in())) .to_string(); curl_url_set(cu, CURLUPART_PASSWORD, anon_pass.c_str(), CURLU_URLENCODE); } if (curl_url_get( cu, CURLUPART_HOST, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { auto anon_host = this->get_default( this->ta_host_names, url_part.in(), [](size_t size, auto& hn) { const auto& diseases = get_disease_list(); return fmt::format(FMT_STRING("{}.example.com"), diseases.at_index(size)); }); curl_url_set(cu, CURLUPART_HOST, anon_host.c_str(), CURLU_URLENCODE); } if (curl_url_get( cu, CURLUPART_PATH, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { ghc::filesystem::path url_path(url_part.in()); ghc::filesystem::path anon_path; for (const auto& comp : url_path) { if (comp == comp.root_path()) { anon_path = anon_path / comp; continue; } anon_path = anon_path / this->next(comp.string()); } curl_url_set(cu, CURLUPART_PATH, anon_path.c_str(), CURLU_URLENCODE); } if (curl_url_get(cu, CURLUPART_QUERY, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { static const auto SPLIT_RE = lnav::pcre2pp::code::from_const(R"((&))"); curl_url_set(cu, CURLUPART_QUERY, nullptr, 0); auto url_query = string_fragment::from_c_str(url_part.in()); auto replacer = [this, &cu](const std::string& comp) { std::string anon_query; auto eq_index = comp.find('='); if (eq_index != std::string::npos) { auto new_key = this->next(comp.substr(0, eq_index)); auto new_value = this->next(comp.substr(eq_index + 1)); anon_query = fmt::format( FMT_STRING("{}={}"), new_key, new_value); } else { anon_query = this->next(comp); } curl_url_set(cu, CURLUPART_QUERY, anon_query.c_str(), CURLU_URLENCODE | CURLU_APPENDQUERY); }; auto loop_res = SPLIT_RE.capture_from(url_query).for_each( [&replacer](lnav::pcre2pp::match_data& md) { replacer(md.leading().to_string()); }); if (loop_res.isOk()) { replacer(loop_res.unwrap().to_string()); } } if (curl_url_get(cu, CURLUPART_FRAGMENT, url_part.out(), CURLU_URLDECODE) == CURLUE_OK) { auto anon_frag = this->next( string_fragment::from_c_str(url_part.in())); curl_url_set(cu, CURLUPART_FRAGMENT, anon_frag.c_str(), CURLU_URLENCODE); } auto_mem anon_url(curl_free); if (curl_url_get(cu, CURLUPART_URL, anon_url.out(), 0) == CURLUE_OK) { retval.append(anon_url.in()); } } break; } case DT_PATH: { ghc::filesystem::path inp_path(tok_res->to_string()); ghc::filesystem::path anon_path; for (const auto& comp : inp_path) { auto comp_str = comp.string(); if (comp == comp.root_path() || comp == inp_path) { anon_path = anon_path / comp; continue; } anon_path = anon_path / this->next(comp_str); } retval += anon_path.string(); break; } case DT_CREDIT_CARD_NUMBER: { auto cc = tok_res->to_string(); auto has_spaces = cc.size() > 16; auto new_end = std::remove_if( cc.begin(), cc.end(), [](auto ch) { return ch == ' '; }); cc.erase(new_end, cc.end()); auto anon_cc = hasher().update(cc).to_string().substr(0, 16); if (has_spaces) { anon_cc.insert(12, " "); anon_cc.insert(8, " "); anon_cc.insert(4, " "); } retval += anon_cc; break; } case DT_MAC_ADDRESS: { // 00-00-5E-00-53-00 auto mac_addr = tok_res->to_string(); retval += this->get_default( this->ta_mac_addresses, mac_addr, [](size_t size, auto& inp) { uint32_t base_mac = 0x5e005300; base_mac += size; auto anon_mac = byte_array<6>::from({ 0x00, 0x00, (unsigned char) ((base_mac >> 24) & 0xff), (unsigned char) ((base_mac >> 16) & 0xff), (unsigned char) ((base_mac >> 8) & 0xff), (unsigned char) ((base_mac >> 0) & 0xff), }); return anon_mac.to_string( nonstd::make_optional(inp[2])); }); break; } case DT_HEX_DUMP: { auto hex_str = tok_res->to_string(); auto hash_str = hasher().update(hex_str).to_array().to_string( nonstd::make_optional(hex_str[2])); std::string anon_hex; while (anon_hex.size() < hex_str.size()) { anon_hex += hash_str; } anon_hex.resize(hex_str.size()); retval += anon_hex; break; } case DT_IPV4_ADDRESS: { auto ipv4 = tok_res->to_string(); retval += this->get_default( this->ta_ipv4_addresses, ipv4, [](size_t size, auto& _) { char anon_ipv4[INET_ADDRSTRLEN]; struct in_addr ia; inet_aton("10.0.0.0", &ia); ia.s_addr = htonl(ntohl(ia.s_addr) + 1 + size); inet_ntop(AF_INET, &ia, anon_ipv4, sizeof(anon_ipv4)); return std::string{anon_ipv4}; }); break; } case DT_IPV6_ADDRESS: { auto ipv6 = tok_res->to_string(); retval += this->get_default( this->ta_ipv6_addresses, ipv6, [](size_t size, auto& _) { char anon_ipv6[INET6_ADDRSTRLEN]; struct in6_addr ia; uint32_t* ia6_addr32 = (uint32_t*) &ia.s6_addr[12]; inet_pton(AF_INET6, "2001:db8::", &ia); *ia6_addr32 = htonl(ntohl(*ia6_addr32) + 1 + size); inet_ntop(AF_INET6, &ia, anon_ipv6, sizeof(anon_ipv6)); return std::string{anon_ipv6}; }); break; } case DT_EMAIL: { auto email_addr = tok_res->to_string(); auto at_index = email_addr.find('@'); retval += fmt::format( FMT_STRING("{}@{}.example.com"), this->get_default(this->ta_user_names, email_addr.substr(0, at_index), [](auto size, const auto& inp) { return get_animal_list().at_index( size); }), this->get_default(this->ta_host_names, email_addr.substr(at_index + 1), [](auto size, const auto& inp) { return get_disease_list().at_index( size); })); break; } case DT_WORD: case DT_SYMBOL: { static const auto SPLIT_RE = lnav::pcre2pp::code::from_const( R"((\.|::|_|-|/|\\|\d+))"); auto symbol_frag = ds.to_string_fragment(tok_res->tr_capture); auto sym_provider = [](auto size, const auto& inp) { if (inp.size() <= 4) { return inp; } auto comp_frag = string_fragment::from_str(inp); return string_fragment::from_str( get_word_list().at_index(size)) .to_string_with_case_style( comp_frag.detect_text_case_style()); }; auto cap_res = SPLIT_RE.capture_from(symbol_frag) .for_each([this, &retval, &sym_provider]( lnav::pcre2pp::match_data& md) { auto comp = md.leading().to_string(); retval += this->get_default( this->ta_symbols, comp, sym_provider) + md[0]->to_string(); }); if (cap_res.isErr()) { retval += ""; } else { auto remaining = cap_res.unwrap().to_string(); retval += this->get_default( this->ta_symbols, remaining, sym_provider); } break; } case DT_QUOTED_STRING: { auto anon_inner = this->next( ds.to_string_fragment(tok_res->tr_inner_capture) .to_string()); retval += line.sub_range(tok_res->tr_capture.c_begin, tok_res->tr_inner_capture.c_begin) .to_string() + anon_inner + ds.to_string_fragment(tok_res->tr_capture).back(); break; } case DT_XML_OPEN_TAG: { auto open_tag = tok_res->to_string(); auto space_index = open_tag.find(' '); if (space_index == std::string::npos) { retval += open_tag; } else { static const auto ATTR_RE = lnav::pcre2pp::code::from_const(R"([\w\-]+=)"); static thread_local auto md = lnav::pcre2pp::match_data::unitialized(); auto remaining = string_fragment::from_str_range( open_tag, space_index, open_tag.size()); retval += open_tag.substr(0, space_index + 1); while (!remaining.empty()) { auto cap_res = ATTR_RE.capture_from(remaining) .into(md) .matches() .ignore_error(); if (!cap_res) { break; } retval += md.leading(); retval += md[0]->to_string(); remaining = md.remaining(); data_scanner ds(remaining); auto attr_tok_res = ds.tokenize2(); if (!attr_tok_res) { continue; } retval += this->next(attr_tok_res->to_string()); remaining = remaining.substr( attr_tok_res->tr_capture.length()); } retval += remaining.to_string(); } break; } case DT_UUID: { retval += hasher().update(tok_res->to_string()).to_uuid_string(); break; } default: { retval += tok_res->to_string(); break; } } } return retval; } } // namespace lnav