use html5ever::interface::QualName; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::{format_tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns}; use http::retrieve_asset; use js::attr_is_event_handler; use std::collections::HashMap; use std::default::Default; use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol}; lazy_static! { static ref EMPTY_STRING: String = String::new(); } const ICON_VALUES: [&str; 5] = [ "icon", "shortcut icon", "mask-icon", "apple-touch-icon", "fluid-icon", ]; const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; pub fn get_parent_node(node: &Handle) -> Handle { let parent = node.parent.take().clone(); parent.and_then(|node| node.upgrade()).unwrap() } pub fn get_node_name(node: &Handle) -> String { match &node.data { NodeData::Element { ref name, .. } => name.local.as_ref().to_string(), _ => EMPTY_STRING.clone(), } } pub fn is_icon(attr_value: &str) -> bool { ICON_VALUES.contains(&&*attr_value.to_lowercase()) } pub fn walk_and_embed_assets( cache: &mut HashMap, url: &str, node: &Handle, opt_no_css: bool, opt_no_js: bool, opt_no_images: bool, opt_user_agent: &str, opt_silent: bool, opt_insecure: bool, opt_no_frames: bool, ) { match node.data { NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( cache, &url, child, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); } } NodeData::Element { ref name, ref attrs, .. } => { let attrs_mut = &mut attrs.borrow_mut(); match name.local.as_ref() { "link" => { let mut link_type: &str = ""; for attr in attrs_mut.iter_mut() { if &attr.name.local == "rel" { if is_icon(&attr.value.to_string()) { link_type = "icon"; break; } else if attr.value.to_string() == "stylesheet" { link_type = "stylesheet"; break; } } } if link_type == "icon" { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { if opt_no_images { attr.value.clear(); } else { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (favicon_dataurl, _) = retrieve_asset( cache, &href_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); attr.value.clear(); attr.value.push_slice(favicon_dataurl.as_str()); } } } } else if link_type == "stylesheet" { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { if opt_no_css { attr.value.clear(); } else { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let replacement_text = match retrieve_asset( cache, &href_full_url, false, "text/css", opt_user_agent, opt_silent, opt_insecure, ) { // On successful retrieval, traverse CSS Ok((css_data, _)) => resolve_css_imports( cache, &css_data, true, &href_full_url, opt_no_images, opt_user_agent, opt_silent, opt_insecure, ), // If a network error occured, warn Err(e) => { eprintln!("Warning: {}", e,); // If failed to resolve, replace with absolute URL href_full_url } }; attr.value.clear(); attr.value.push_slice(&replacement_text); } } } } else { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(&href_full_url.as_str()); } } } } "img" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { let value = attr.value.to_string(); // Ignore images with empty source if value == EMPTY_STRING.clone() { continue; } if opt_no_images { attr.value.clear(); attr.value.push_slice(TRANSPARENT_PIXEL); } else { let src_full_url: String = resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone()); let (img_dataurl, _) = retrieve_asset( cache, &src_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); attr.value.clear(); attr.value.push_slice(img_dataurl.as_str()); } } } } "source" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name == "src" { let src_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(attr.value.to_string()); attr.value.clear(); attr.value.push_slice(src_full_url.as_str()); } else if attr_name == "srcset" { if get_node_name(&get_parent_node(&node)) == "picture" { if opt_no_images { attr.value.clear(); attr.value.push_slice(TRANSPARENT_PIXEL); } else { let srcset_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (source_dataurl, _) = retrieve_asset( cache, &srcset_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); attr.value.clear(); attr.value.push_slice(source_dataurl.as_str()); } } } } } "a" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { // Don't touch email links or hrefs which begin with a hash sign if attr.value.starts_with('#') || url_has_protocol(&attr.value) { continue; } let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(href_full_url.as_str()); } } } "script" => { if opt_no_js { // Empty src and inner content of SCRIPT tags for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { attr.value.clear(); } } node.children.borrow_mut().clear(); } else { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { let src_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let (js_dataurl, _) = retrieve_asset( cache, &src_full_url, true, "application/javascript", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone())); attr.value.clear(); attr.value.push_slice(js_dataurl.as_str()); } } } } "style" => { if opt_no_css { // Empty inner content of STYLE tags node.children.borrow_mut().clear(); } else { for node in node.children.borrow_mut().iter_mut() { if let NodeData::Text { ref contents } = node.data { let mut tendril = contents.borrow_mut(); let replacement = resolve_css_imports( cache, tendril.as_ref(), false, &url, opt_no_images, opt_user_agent, opt_silent, opt_insecure, ); tendril.clear(); tendril.push_slice(&replacement); } } } } "form" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "action" { // Modify action to be a full URL if !is_valid_url(&attr.value) { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(href_full_url.as_str()); } } } } "iframe" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { if opt_no_frames { // Empty the src attribute attr.value.clear(); continue; } let iframe_src: String = attr.value.to_string(); // Ignore iframes with empty source (they cause infinite loops) if iframe_src == EMPTY_STRING.clone() { continue; } let src_full_url: String = resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); let (iframe_data, iframe_final_url) = retrieve_asset( cache, &src_full_url, false, "text/html", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((EMPTY_STRING.clone(), src_full_url)); let dom = html_to_dom(&iframe_data); walk_and_embed_assets( cache, &iframe_final_url, &dom.document, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); let mut buf: Vec = Vec::new(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); let iframe_dataurl = data_to_dataurl("text/html", &buf); attr.value.clear(); attr.value.push_slice(iframe_dataurl.as_str()); } } } "video" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "poster" { let video_poster = attr.value.to_string(); // Skip posters with empty source if video_poster == EMPTY_STRING.clone() { continue; } if opt_no_images { attr.value.clear(); } else { let poster_full_url: String = resolve_url(&url, &video_poster) .unwrap_or(EMPTY_STRING.clone()); let (poster_dataurl, _) = retrieve_asset( cache, &poster_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or((poster_full_url, EMPTY_STRING.clone())); attr.value.clear(); attr.value.push_slice(poster_dataurl.as_str()); } } } } _ => {} } // Process style attributes if opt_no_css { // Get rid of style attributes let mut style_attr_indexes = Vec::new(); for (i, attr) in attrs_mut.iter_mut().enumerate() { if attr.name.local.to_lowercase() == "style" { style_attr_indexes.push(i); } } style_attr_indexes.reverse(); for attr_index in style_attr_indexes { attrs_mut.remove(attr_index); } } else { // Otherwise, parse any links found in the attributes for attribute in attrs_mut .iter_mut() .filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style")) { let replacement = resolve_css_imports( cache, attribute.value.as_ref(), false, &url, opt_no_images, opt_user_agent, opt_silent, opt_insecure, ); attribute.value.clear(); attribute.value.push_slice(&replacement); } } if opt_no_js { // Get rid of JS event attributes let mut js_attr_indexes = Vec::new(); for (i, attr) in attrs_mut.iter_mut().enumerate() { if attr_is_event_handler(&attr.name.local) { js_attr_indexes.push(i); } } js_attr_indexes.reverse(); for attr_index in js_attr_indexes { attrs_mut.remove(attr_index); } } // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( cache, &url, child, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); } } _ => { // Note: in case of opt_no_js being set to true, there's no need to worry about // getting rid of comments that may contain scripts, e.g.