ferron/util/
url_sanitizer.rs

// Copyright (c) 2018-2025 SVR.JS
// Portions of this file are derived from SVR.JS (https://github.com/svr-js/svrjs).
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
14use anyhow::{anyhow, Result};
15use smallvec::SmallVec;
16
// Byte-indexed lookup table: `true` marks a character that is safe to leave
// unencoded. `sanitize_url` consults it to decide whether a percent-encoded
// byte is emitted decoded or kept as a `%XX` escape.
static SAFE_CHARS: [bool; 256] = {
  const PASS_THROUGH: &[u8] =
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!$&'()*+,-./:;=@[]_~";

  // `for` loops are not available in a static initializer, so walk the list
  // with a manual countdown.
  let mut lookup = [false; 256];
  let mut remaining = PASS_THROUGH.len();
  while remaining > 0 {
    remaining -= 1;
    lookup[PASS_THROUGH[remaining] as usize] = true;
  }
  lookup
};
28
// Uppercase hexadecimal digits indexed by nibble value (0..=15); used when
// emitting `%XX` escapes.
static HEX_CHARS: [u8; 16] = [
  b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'A', b'B', b'C', b'D', b'E', b'F',
];
31
32/// Sanitizes the URL
33pub fn sanitize_url(resource: &str, allow_double_slashes: bool) -> Result<String> {
34  if resource == "*" || resource.is_empty() {
35    return Ok(resource.to_string());
36  }
37
38  let bytes = resource.as_bytes();
39  let mut result = SmallVec::<[u8; 256]>::with_capacity(bytes.len() * 2);
40
41  // Combined pass: remove nulls, validate percent encoding, decode/encode in one go
42  let mut i = 0;
43  while i < bytes.len() {
44    let byte = bytes[i];
45
46    // Skip null bytes
47    if byte == 0 {
48      i += 1;
49      continue;
50    }
51
52    if byte == b'%' {
53      // Validate percent encoding
54      if i + 2 >= bytes.len() {
55        return Err(anyhow!("URI malformed"));
56      }
57
58      let hi = bytes[i + 1];
59      let lo = bytes[i + 2];
60
61      if !hi.is_ascii_hexdigit() || !lo.is_ascii_hexdigit() {
62        return Err(anyhow!("URI malformed"));
63      }
64
65      let value = hex_to_byte_fast(hi, lo)?;
66      if value == 0xc0 || value == 0xc1 || value >= 0xfe {
67        return Err(anyhow!("URI malformed"));
68      }
69
70      // Decode if safe, otherwise keep encoded
71      if value == 0 {
72        // Skip null bytes even when percent-encoded
73        i += 3;
74        continue;
75      } else if SAFE_CHARS[value as usize] {
76        result.push(value);
77      } else {
78        result.push(b'%');
79        result.push(hi);
80        result.push(lo);
81      }
82      i += 3;
83    } else {
84      // Handle special characters that need encoding
85      match byte {
86        b'<' | b'>' | b'^' | b'`' | b'{' | b'|' | b'}' => {
87          result.push(b'%');
88          result.push(HEX_CHARS[(byte >> 4) as usize]);
89          result.push(HEX_CHARS[(byte & 0xF) as usize]);
90        }
91        _ => result.push(byte),
92      }
93      i += 1;
94    }
95  }
96
97  // Ensure starts with '/' - but handle double slashes properly
98  if result.is_empty() || result[0] != b'/' {
99    result.insert(0, b'/');
100  } else if allow_double_slashes && bytes.len() >= 2 && bytes[0] == b'/' && bytes[1] == b'/' {
101    // If the original input started with // and we allow double slashes,
102    // ensure we preserve that (the processing above might have normalized it)
103    if result.len() >= 2 && result[0] == b'/' && result[1] != b'/' {
104      result.insert(1, b'/');
105    }
106  }
107
108  // Normalize slashes and build segments in one pass
109  let mut segments = SmallVec::<[SmallVec<[u8; 32]>; 16]>::new();
110  let mut current_segment = SmallVec::<[u8; 32]>::new();
111  let mut last_was_slash = true; // Start with true since we ensured it starts with '/'
112
113  i = if allow_double_slashes && result.len() >= 2 && result[0] == b'/' && result[1] == b'/' {
114    2 // Skip both leading slashes for UNC paths
115  } else {
116    1 // Skip single leading slash
117  };
118
119  while i < result.len() {
120    let byte = result[i];
121
122    if byte == b'\\' || byte == b'/' {
123      if !current_segment.is_empty() {
124        // Trim trailing dots, but preserve "." and ".." for navigation
125        if current_segment.as_slice() != b"." && current_segment.as_slice() != b".." {
126          while let Some(&b'.') = current_segment.last() {
127            current_segment.pop();
128          }
129        }
130
131        if !current_segment.is_empty() {
132          segments.push(current_segment);
133          current_segment = SmallVec::new();
134        }
135      }
136
137      if allow_double_slashes && last_was_slash {
138        // Add empty segment for double slash
139        segments.push(SmallVec::new());
140      }
141      last_was_slash = true;
142    } else {
143      current_segment.push(byte);
144      last_was_slash = false;
145    }
146    i += 1;
147  }
148
149  // Handle final segment
150  if !current_segment.is_empty() {
151    // Trim trailing dots, but preserve "." and ".." for navigation
152    if current_segment.as_slice() != b"." && current_segment.as_slice() != b".." {
153      while let Some(&b'.') = current_segment.last() {
154        current_segment.pop();
155      }
156    }
157    if !current_segment.is_empty() {
158      segments.push(current_segment);
159    }
160  }
161
162  // Process segments for . and .. navigation
163  let mut final_segments = SmallVec::<[SmallVec<[u8; 32]>; 16]>::new();
164  for segment in segments {
165    if segment.is_empty() && allow_double_slashes {
166      final_segments.push(segment);
167    } else if segment.as_slice() == b"." {
168      // Skip current directory
169      continue;
170    } else if segment.as_slice() == b".." {
171      // Parent directory - remove last segment
172      final_segments.pop();
173    } else if !segment.is_empty() {
174      final_segments.push(segment);
175    }
176  }
177
178  // Build final result
179  let mut final_result = SmallVec::<[u8; 256]>::with_capacity(result.len());
180
181  // Handle UNC paths (//server/share) when double slashes are allowed
182  if allow_double_slashes && bytes.len() >= 2 && bytes[0] == b'/' && bytes[1] == b'/' {
183    final_result.push(b'/');
184    final_result.push(b'/');
185  } else {
186    final_result.push(b'/');
187  }
188
189  let preserve_trailing_slash =
190    !result.is_empty() && (result[result.len() - 1] == b'/' || result[result.len() - 1] == b'\\');
191
192  for (idx, segment) in final_segments.iter().enumerate() {
193    if idx > 0 || (allow_double_slashes && segment.is_empty()) {
194      final_result.push(b'/');
195    }
196    final_result.extend_from_slice(segment);
197  }
198
199  // Add trailing slash if it was originally present and we have segments,
200  // but don't add it if we're just dealing with the root "/"
201  if preserve_trailing_slash && !final_segments.is_empty() {
202    final_result.push(b'/');
203  }
204
205  // Convert to string
206  String::from_utf8(final_result.into_vec()).map_err(|_| anyhow!("Invalid UTF-8 in result"))
207}
208
209#[inline(always)]
210fn hex_to_byte_fast(hi: u8, lo: u8) -> Result<u8> {
211  #[inline(always)]
212  fn hex_val(c: u8) -> Option<u8> {
213    match c {
214      b'0'..=b'9' => Some(c - b'0'),
215      b'a'..=b'f' => Some(10 + (c - b'a')),
216      b'A'..=b'F' => Some(10 + (c - b'A')),
217      _ => None,
218    }
219  }
220  match (hex_val(hi), hex_val(lo)) {
221    (Some(h), Some(l)) => Ok(h << 4 | l),
222    _ => Err(anyhow!("Invalid hex")),
223  }
224}
225
// Path sanitizer tests taken from SVR.JS web server
#[cfg(test)]
mod tests {
  use super::*;
  use anyhow::Result;

  /// Checks every `(input, expected)` pair with double slashes disallowed.
  fn expect_sanitized(cases: &[(&str, &str)]) -> Result<()> {
    for &(input, expected) in cases {
      assert_eq!(sanitize_url(input, false)?, expected);
    }
    Ok(())
  }

  /// Checks that every input comes back unchanged with double slashes disallowed.
  fn expect_unchanged(inputs: &[&str]) -> Result<()> {
    for &input in inputs {
      assert_eq!(sanitize_url(input, false)?, input);
    }
    Ok(())
  }

  /// Checks that every input is rejected with double slashes disallowed.
  fn expect_rejected(inputs: &[&str]) {
    for &input in inputs {
      assert!(sanitize_url(input, false).is_err());
    }
  }

  #[test]
  fn should_not_change_slash() -> Result<()> {
    expect_unchanged(&["/"])
  }

  #[test]
  fn should_return_asterisk_for_asterisk() -> Result<()> {
    expect_unchanged(&["*"])
  }

  #[test]
  fn should_return_empty_string_for_empty_string() -> Result<()> {
    expect_unchanged(&[""])
  }

  #[test]
  fn should_remove_null_characters() -> Result<()> {
    expect_sanitized(&[("/test%00", "/test"), ("/test\0", "/test")])
  }

  #[test]
  fn should_throw_uri_error_for_malformed_url() {
    expect_rejected(&["%c0%af", "%u002f", "%as"]);
  }

  #[test]
  fn should_ensure_the_resource_starts_with_a_slash() -> Result<()> {
    expect_sanitized(&[("test", "/test")])
  }

  #[test]
  fn should_convert_backslashes_to_slashes() -> Result<()> {
    expect_sanitized(&[("test\\path", "/test/path")])
  }

  #[test]
  fn should_handle_duplicate_slashes() -> Result<()> {
    for (allow_double_slashes, expected) in [(false, "/test/path"), (true, "/test//path")] {
      assert_eq!(sanitize_url("test//path", allow_double_slashes)?, expected);
    }
    Ok(())
  }

  #[test]
  fn should_handle_relative_navigation() -> Result<()> {
    expect_sanitized(&[
      ("/./test", "/test"),
      ("/../test", "/test"),
      ("../test", "/test"),
      ("./test", "/test"),
      ("/test/./", "/test/"),
      ("/test/../", "/"),
      ("/test/../path", "/path"),
    ])
  }

  #[test]
  fn should_remove_trailing_dots_in_paths() -> Result<()> {
    expect_sanitized(&[("/test...", "/test"), ("/test.../", "/test/")])
  }

  #[test]
  fn should_return_slash_for_empty_sanitized_resource() -> Result<()> {
    expect_sanitized(&[("/../..", "/")])
  }

  #[test]
  fn should_encode_special_characters() -> Result<()> {
    expect_sanitized(&[
      ("/test<path>", "/test%3Cpath%3E"),
      ("/test^path", "/test%5Epath"),
      ("/test`path", "/test%60path"),
      ("/test{path}", "/test%7Bpath%7D"),
      ("/test|path", "/test%7Cpath"),
    ])
  }

  #[test]
  fn should_preserve_certain_characters() -> Result<()> {
    expect_unchanged(&[
      "/test!path",
      "/test$path",
      "/test&path",
      "/test-path",
      "/test=path",
      "/test@path",
      "/test_path",
      "/test~path",
    ])
  }

  #[test]
  fn should_decode_url_encoded_characters_while_preserving_certain_characters() -> Result<()> {
    expect_sanitized(&[
      ("/test%20path", "/test%20path"),
      ("/test%21path", "/test!path"),
      ("/test%22path", "/test%22path"),
      ("/test%24path", "/test$path"),
      ("/test%25path", "/test%25path"),
      ("/test%26path", "/test&path"),
      ("/test%2Dpath", "/test-path"),
      ("/test%3Cpath", "/test%3Cpath"),
      ("/test%3Dpath", "/test=path"),
      ("/test%3Epath", "/test%3Epath"),
      ("/test%40path", "/test@path"),
      ("/test%5Fpath", "/test_path"),
      ("/test%7Dpath", "/test%7Dpath"),
      ("/test%7Epath", "/test~path"),
    ])
  }

  #[test]
  fn should_decode_url_encoded_alphanumeric_characters_while_preserving_certain_characters() -> Result<()> {
    expect_sanitized(&[
      ("/conf%69g.json", "/config.json"),
      ("/CONF%49G.JSON", "/CONFIG.JSON"),
      ("/svr%32.js", "/svr2.js"),
      ("/%73%76%72%32%2E%6A%73", "/svr2.js"),
    ])
  }

  #[test]
  fn should_decode_url_encoded_characters_regardless_of_the_letter_case_of_the_url_encoding() -> Result<()> {
    expect_sanitized(&[("/%5f", "/_"), ("/%5F", "/_")])
  }

  // Edge cases for percent encoding
  #[test]
  fn should_handle_incomplete_percent_encoding() {
    expect_rejected(&["/test%2", "/test%", "/test%G", "/test%2G"]);
  }

  #[test]
  fn should_handle_invalid_utf8_bytes() {
    // These are invalid UTF-8 sequences that should be rejected
    expect_rejected(&[
      "/test%C0%80", // Overlong encoding
      "/test%C1%BF", // Overlong encoding
      "/test%FE%FF", // Invalid bytes
      "/test%FF%FE", // Invalid bytes
    ]);
  }

  // Complex path navigation tests
  #[test]
  fn should_handle_complex_relative_paths() -> Result<()> {
    expect_sanitized(&[
      ("/a/b/c/../../d", "/a/d"),
      ("/a/./b/../c/./d/..", "/a/c"),
      ("/a/b/c/../../../..", "/"),
      ("./../../a/b/c", "/a/b/c"),
    ])
  }

  #[test]
  fn should_handle_dots_in_filenames() -> Result<()> {
    expect_unchanged(&["/file.txt", "/file.backup.txt", "/hidden/.config", "/.htaccess"])
  }

  #[test]
  fn should_preserve_double_slashes_when_allowed() -> Result<()> {
    let cases = [
      ("//server/share", "//server/share"),
      ("/path//to///file", "/path//to///file"),
      ("test///path", "/test///path"),
    ];
    for (input, expected) in cases {
      assert_eq!(sanitize_url(input, true)?, expected);
    }
    Ok(())
  }

  // Trailing slash preservation tests
  #[test]
  fn should_handle_trailing_slashes_correctly() -> Result<()> {
    expect_sanitized(&[
      ("/test/", "/test/"),
      ("/test/path/", "/test/path/"),
      ("/test\\", "/test/"),
      ("/test/./", "/test/"),
      ("/test/../", "/"),
    ])
  }

  #[test]
  fn should_not_add_trailing_slash_to_root() -> Result<()> {
    expect_sanitized(&[("/", "/"), ("//", "/"), ("///", "/")])
  }

  // Unicode and special character tests
  #[test]
  fn should_handle_unicode_characters() -> Result<()> {
    expect_unchanged(&["/测试", "/тест", "/🚀", "/café"])
  }

  #[test]
  fn should_encode_all_required_special_characters() -> Result<()> {
    expect_sanitized(&[
      ("/test<script>", "/test%3Cscript%3E"),
      ("/test{json}", "/test%7Bjson%7D"),
      ("/test|pipe", "/test%7Cpipe"),
      ("/test^caret", "/test%5Ecaret"),
      ("/test`backtick", "/test%60backtick"),
    ])
  }

  // Whitespace and control character tests
  #[test]
  fn should_handle_whitespace_characters() -> Result<()> {
    expect_unchanged(&["/test path", "/test\tpath", "/test\npath", "/test\rpath"])
  }

  #[test]
  fn should_handle_encoded_whitespace() -> Result<()> {
    expect_unchanged(&[
      "/test%20path",
      "/test%09path", // Tab
      "/test%0Apath", // LF
      "/test%0Dpath", // CR
    ])
  }

  // Query parameters and fragments (if they should be handled)
  #[test]
  fn should_handle_query_and_fragment_characters() -> Result<()> {
    expect_unchanged(&["/path?query=value", "/path#fragment", "/path?q=1&b=2#frag"])
  }

  // Very long paths
  #[test]
  fn should_handle_long_paths() -> Result<()> {
    let long_segment = "a".repeat(1000);
    let long_path = format!("/{long_segment}");
    expect_unchanged(&[long_path.as_str()])
  }

  // Mixed separators
  #[test]
  fn should_handle_mixed_separators() -> Result<()> {
    expect_sanitized(&[
      ("/test\\path/to\\file", "/test/path/to/file"),
      ("\\test/path\\to/file\\", "/test/path/to/file/"),
    ])
  }

  // Empty segments
  #[test]
  fn should_handle_empty_segments() -> Result<()> {
    expect_sanitized(&[("/a//b", "/a/b"), ("/a///b", "/a/b"), ("///a///b///", "/a/b/")])
  }

  // Case sensitivity
  #[test]
  fn should_preserve_case() -> Result<()> {
    expect_unchanged(&["/TeSt/PaTh", "/TEST/path"])
  }

  // Boundary conditions for hex values
  #[test]
  fn should_handle_hex_boundary_values() -> Result<()> {
    expect_sanitized(&[
      ("/test%00", "/test"),    // Null byte - should be removed
      ("/test%01", "/test%01"), // Control char - should stay encoded
      ("/test%7F", "/test%7F"), // DEL - should stay encoded
      ("/test%80", "/test%80"), // High bit set - should stay encoded
    ])
  }

  // Real-world examples
  #[test]
  fn should_handle_realistic_paths() -> Result<()> {
    expect_unchanged(&[
      "/api/v1/users/123",
      "/static/css/main.css",
      "/uploads/2023/12/image.jpg",
      "/docs/README.md",
    ])
  }

  // Security-related tests
  #[test]
  fn should_prevent_directory_traversal_attacks() -> Result<()> {
    expect_sanitized(&[
      ("/../../../etc/passwd", "/etc/passwd"),
      ("/app/../../../etc/passwd", "/etc/passwd"),
      ("/safe/path/../../../../../../etc/passwd", "/etc/passwd"),
    ])
  }

  // Test interaction between features
  #[test]
  fn should_handle_complex_combinations() -> Result<()> {
    expect_sanitized(&[
      // Null bytes + encoding + navigation + special chars
      ("/test%00/../path%3C%3E%7C", "/path%3C%3E%7C"),
      // Backslashes + dots + encoding
      ("test\\..\\path%21\\file.txt", "/path!/file.txt"),
    ])
  }
}