1use anyhow::{anyhow, Result};
15use smallvec::SmallVec;
16
/// Lookup table indexed by byte value: `true` for every byte that a `%XX` escape may be
/// decoded to (ASCII letters, digits and the punctuation ``!$&'()*+,-./:;=@[]_~``).
static SAFE_CHARS: [bool; 256] = {
    let mut lookup = [false; 256];
    let mut code = 0usize;
    // `for` loops are not available in a constant initializer, hence the manual counter.
    while code < 256 {
        lookup[code] = matches!(
            code as u8,
            b'A'..=b'Z'
                | b'a'..=b'z'
                | b'0'..=b'9'
                | b'!'
                | b'$'
                | b'&'
                | b'\''
                | b'('
                | b')'
                | b'*'
                | b'+'
                | b','
                | b'-'
                | b'.'
                | b'/'
                | b':'
                | b';'
                | b'='
                | b'@'
                | b'['
                | b']'
                | b'_'
                | b'~'
        );
        code += 1;
    }
    lookup
};
28
/// Uppercase hexadecimal digits indexed by nibble value (0-15), used when percent-encoding a byte.
static HEX_CHARS: [u8; 16] = [
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'A', b'B', b'C', b'D', b'E', b'F',
];
31
32pub fn sanitize_url(resource: &str, allow_double_slashes: bool) -> Result<String> {
34 if resource == "*" || resource.is_empty() {
35 return Ok(resource.to_string());
36 }
37
38 let bytes = resource.as_bytes();
39 let mut result = SmallVec::<[u8; 256]>::with_capacity(bytes.len() * 2);
40
41 let mut i = 0;
43 while i < bytes.len() {
44 let byte = bytes[i];
45
46 if byte == 0 {
48 i += 1;
49 continue;
50 }
51
52 if byte == b'%' {
53 if i + 2 >= bytes.len() {
55 return Err(anyhow!("URI malformed"));
56 }
57
58 let hi = bytes[i + 1];
59 let lo = bytes[i + 2];
60
61 if !hi.is_ascii_hexdigit() || !lo.is_ascii_hexdigit() {
62 return Err(anyhow!("URI malformed"));
63 }
64
65 let value = hex_to_byte_fast(hi, lo)?;
66 if value == 0xc0 || value == 0xc1 || value >= 0xfe {
67 return Err(anyhow!("URI malformed"));
68 }
69
70 if value == 0 {
72 i += 3;
74 continue;
75 } else if SAFE_CHARS[value as usize] {
76 result.push(value);
77 } else {
78 result.push(b'%');
79 result.push(hi);
80 result.push(lo);
81 }
82 i += 3;
83 } else {
84 match byte {
86 b'<' | b'>' | b'^' | b'`' | b'{' | b'|' | b'}' => {
87 result.push(b'%');
88 result.push(HEX_CHARS[(byte >> 4) as usize]);
89 result.push(HEX_CHARS[(byte & 0xF) as usize]);
90 }
91 _ => result.push(byte),
92 }
93 i += 1;
94 }
95 }
96
97 if result.is_empty() || result[0] != b'/' {
99 result.insert(0, b'/');
100 } else if allow_double_slashes && bytes.len() >= 2 && bytes[0] == b'/' && bytes[1] == b'/' {
101 if result.len() >= 2 && result[0] == b'/' && result[1] != b'/' {
104 result.insert(1, b'/');
105 }
106 }
107
108 let mut segments = SmallVec::<[SmallVec<[u8; 32]>; 16]>::new();
110 let mut current_segment = SmallVec::<[u8; 32]>::new();
111 let mut last_was_slash = true; i = if allow_double_slashes && result.len() >= 2 && result[0] == b'/' && result[1] == b'/' {
114 2 } else {
116 1 };
118
119 while i < result.len() {
120 let byte = result[i];
121
122 if byte == b'\\' || byte == b'/' {
123 if !current_segment.is_empty() {
124 if current_segment.as_slice() != b"." && current_segment.as_slice() != b".." {
126 while let Some(&b'.') = current_segment.last() {
127 current_segment.pop();
128 }
129 }
130
131 if !current_segment.is_empty() {
132 segments.push(current_segment);
133 current_segment = SmallVec::new();
134 }
135 }
136
137 if allow_double_slashes && last_was_slash {
138 segments.push(SmallVec::new());
140 }
141 last_was_slash = true;
142 } else {
143 current_segment.push(byte);
144 last_was_slash = false;
145 }
146 i += 1;
147 }
148
149 if !current_segment.is_empty() {
151 if current_segment.as_slice() != b"." && current_segment.as_slice() != b".." {
153 while let Some(&b'.') = current_segment.last() {
154 current_segment.pop();
155 }
156 }
157 if !current_segment.is_empty() {
158 segments.push(current_segment);
159 }
160 }
161
162 let mut final_segments = SmallVec::<[SmallVec<[u8; 32]>; 16]>::new();
164 for segment in segments {
165 if segment.is_empty() && allow_double_slashes {
166 final_segments.push(segment);
167 } else if segment.as_slice() == b"." {
168 continue;
170 } else if segment.as_slice() == b".." {
171 final_segments.pop();
173 } else if !segment.is_empty() {
174 final_segments.push(segment);
175 }
176 }
177
178 let mut final_result = SmallVec::<[u8; 256]>::with_capacity(result.len());
180
181 if allow_double_slashes && bytes.len() >= 2 && bytes[0] == b'/' && bytes[1] == b'/' {
183 final_result.push(b'/');
184 final_result.push(b'/');
185 } else {
186 final_result.push(b'/');
187 }
188
189 let preserve_trailing_slash =
190 !result.is_empty() && (result[result.len() - 1] == b'/' || result[result.len() - 1] == b'\\');
191
192 for (idx, segment) in final_segments.iter().enumerate() {
193 if idx > 0 || (allow_double_slashes && segment.is_empty()) {
194 final_result.push(b'/');
195 }
196 final_result.extend_from_slice(segment);
197 }
198
199 if preserve_trailing_slash && !final_segments.is_empty() {
202 final_result.push(b'/');
203 }
204
205 String::from_utf8(final_result.into_vec()).map_err(|_| anyhow!("Invalid UTF-8 in result"))
207}
208
209#[inline(always)]
210fn hex_to_byte_fast(hi: u8, lo: u8) -> Result<u8> {
211 #[inline(always)]
212 fn hex_val(c: u8) -> Option<u8> {
213 match c {
214 b'0'..=b'9' => Some(c - b'0'),
215 b'a'..=b'f' => Some(10 + (c - b'a')),
216 b'A'..=b'F' => Some(10 + (c - b'A')),
217 _ => None,
218 }
219 }
220 match (hex_val(hi), hex_val(lo)) {
221 (Some(h), Some(l)) => Ok(h << 4 | l),
222 _ => Err(anyhow!("Invalid hex")),
223 }
224}
225
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;

    /// Sanitizes every input with the given flag and compares it with the expected output,
    /// in order. An `Err` from `sanitize_url` is propagated and fails the calling test.
    fn check_all(allow_double_slashes: bool, cases: &[(&str, &str)]) -> Result<()> {
        for &(input, expected) in cases {
            assert_eq!(
                sanitize_url(input, allow_double_slashes)?,
                expected,
                "input: {:?}",
                input
            );
        }
        Ok(())
    }

    /// Asserts that every input is rejected when double slashes are not allowed.
    fn check_rejected(inputs: &[&str]) {
        for &input in inputs {
            assert!(
                sanitize_url(input, false).is_err(),
                "expected an error for {:?}",
                input
            );
        }
    }

    #[test]
    fn should_not_change_slash() -> Result<()> {
        check_all(false, &[("/", "/")])
    }

    #[test]
    fn should_return_asterisk_for_asterisk() -> Result<()> {
        check_all(false, &[("*", "*")])
    }

    #[test]
    fn should_return_empty_string_for_empty_string() -> Result<()> {
        check_all(false, &[("", "")])
    }

    #[test]
    fn should_remove_null_characters() -> Result<()> {
        check_all(false, &[("/test%00", "/test"), ("/test\0", "/test")])
    }

    #[test]
    fn should_throw_uri_error_for_malformed_url() {
        check_rejected(&["%c0%af", "%u002f", "%as"]);
    }

    #[test]
    fn should_ensure_the_resource_starts_with_a_slash() -> Result<()> {
        check_all(false, &[("test", "/test")])
    }

    #[test]
    fn should_convert_backslashes_to_slashes() -> Result<()> {
        check_all(false, &[("test\\path", "/test/path")])
    }

    #[test]
    fn should_handle_duplicate_slashes() -> Result<()> {
        check_all(false, &[("test//path", "/test/path")])?;
        check_all(true, &[("test//path", "/test//path")])
    }

    #[test]
    fn should_handle_relative_navigation() -> Result<()> {
        check_all(
            false,
            &[
                ("/./test", "/test"),
                ("/../test", "/test"),
                ("../test", "/test"),
                ("./test", "/test"),
                ("/test/./", "/test/"),
                ("/test/../", "/"),
                ("/test/../path", "/path"),
            ],
        )
    }

    #[test]
    fn should_remove_trailing_dots_in_paths() -> Result<()> {
        check_all(false, &[("/test...", "/test"), ("/test.../", "/test/")])
    }

    #[test]
    fn should_return_slash_for_empty_sanitized_resource() -> Result<()> {
        check_all(false, &[("/../..", "/")])
    }

    #[test]
    fn should_encode_special_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/test<path>", "/test%3Cpath%3E"),
                ("/test^path", "/test%5Epath"),
                ("/test`path", "/test%60path"),
                ("/test{path}", "/test%7Bpath%7D"),
                ("/test|path", "/test%7Cpath"),
            ],
        )
    }

    #[test]
    fn should_preserve_certain_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/test!path", "/test!path"),
                ("/test$path", "/test$path"),
                ("/test&path", "/test&path"),
                ("/test-path", "/test-path"),
                ("/test=path", "/test=path"),
                ("/test@path", "/test@path"),
                ("/test_path", "/test_path"),
                ("/test~path", "/test~path"),
            ],
        )
    }

    #[test]
    fn should_decode_url_encoded_characters_while_preserving_certain_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/test%20path", "/test%20path"),
                ("/test%21path", "/test!path"),
                ("/test%22path", "/test%22path"),
                ("/test%24path", "/test$path"),
                ("/test%25path", "/test%25path"),
                ("/test%26path", "/test&path"),
                ("/test%2Dpath", "/test-path"),
                ("/test%3Cpath", "/test%3Cpath"),
                ("/test%3Dpath", "/test=path"),
                ("/test%3Epath", "/test%3Epath"),
                ("/test%40path", "/test@path"),
                ("/test%5Fpath", "/test_path"),
                ("/test%7Dpath", "/test%7Dpath"),
                ("/test%7Epath", "/test~path"),
            ],
        )
    }

    #[test]
    fn should_decode_url_encoded_alphanumeric_characters_while_preserving_certain_characters(
    ) -> Result<()> {
        check_all(
            false,
            &[
                ("/conf%69g.json", "/config.json"),
                ("/CONF%49G.JSON", "/CONFIG.JSON"),
                ("/svr%32.js", "/svr2.js"),
                ("/%73%76%72%32%2E%6A%73", "/svr2.js"),
            ],
        )
    }

    #[test]
    fn should_decode_url_encoded_characters_regardless_of_the_letter_case_of_the_url_encoding(
    ) -> Result<()> {
        check_all(false, &[("/%5f", "/_"), ("/%5F", "/_")])
    }

    #[test]
    fn should_handle_incomplete_percent_encoding() {
        check_rejected(&["/test%2", "/test%", "/test%G", "/test%2G"]);
    }

    #[test]
    fn should_handle_invalid_utf8_bytes() {
        check_rejected(&["/test%C0%80", "/test%C1%BF", "/test%FE%FF", "/test%FF%FE"]);
    }

    #[test]
    fn should_handle_complex_relative_paths() -> Result<()> {
        check_all(
            false,
            &[
                ("/a/b/c/../../d", "/a/d"),
                ("/a/./b/../c/./d/..", "/a/c"),
                ("/a/b/c/../../../..", "/"),
                ("./../../a/b/c", "/a/b/c"),
            ],
        )
    }

    #[test]
    fn should_handle_dots_in_filenames() -> Result<()> {
        check_all(
            false,
            &[
                ("/file.txt", "/file.txt"),
                ("/file.backup.txt", "/file.backup.txt"),
                ("/hidden/.config", "/hidden/.config"),
                ("/.htaccess", "/.htaccess"),
            ],
        )
    }

    #[test]
    fn should_preserve_double_slashes_when_allowed() -> Result<()> {
        check_all(
            true,
            &[
                ("//server/share", "//server/share"),
                ("/path//to///file", "/path//to///file"),
                ("test///path", "/test///path"),
            ],
        )
    }

    #[test]
    fn should_handle_trailing_slashes_correctly() -> Result<()> {
        check_all(
            false,
            &[
                ("/test/", "/test/"),
                ("/test/path/", "/test/path/"),
                ("/test\\", "/test/"),
                ("/test/./", "/test/"),
                ("/test/../", "/"),
            ],
        )
    }

    #[test]
    fn should_not_add_trailing_slash_to_root() -> Result<()> {
        check_all(false, &[("/", "/"), ("//", "/"), ("///", "/")])
    }

    #[test]
    fn should_handle_unicode_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/测试", "/测试"),
                ("/тест", "/тест"),
                ("/🚀", "/🚀"),
                ("/café", "/café"),
            ],
        )
    }

    #[test]
    fn should_encode_all_required_special_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/test<script>", "/test%3Cscript%3E"),
                ("/test{json}", "/test%7Bjson%7D"),
                ("/test|pipe", "/test%7Cpipe"),
                ("/test^caret", "/test%5Ecaret"),
                ("/test`backtick", "/test%60backtick"),
            ],
        )
    }

    #[test]
    fn should_handle_whitespace_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/test path", "/test path"),
                ("/test\tpath", "/test\tpath"),
                ("/test\npath", "/test\npath"),
                ("/test\rpath", "/test\rpath"),
            ],
        )
    }

    #[test]
    fn should_handle_encoded_whitespace() -> Result<()> {
        check_all(
            false,
            &[
                ("/test%20path", "/test%20path"),
                ("/test%09path", "/test%09path"),
                ("/test%0Apath", "/test%0Apath"),
                ("/test%0Dpath", "/test%0Dpath"),
            ],
        )
    }

    #[test]
    fn should_handle_query_and_fragment_characters() -> Result<()> {
        check_all(
            false,
            &[
                ("/path?query=value", "/path?query=value"),
                ("/path#fragment", "/path#fragment"),
                ("/path?q=1&b=2#frag", "/path?q=1&b=2#frag"),
            ],
        )
    }

    #[test]
    fn should_handle_long_paths() -> Result<()> {
        let long_segment = "a".repeat(1000);
        let long_path = format!("/{long_segment}");
        check_all(false, &[(long_path.as_str(), long_path.as_str())])
    }

    #[test]
    fn should_handle_mixed_separators() -> Result<()> {
        check_all(
            false,
            &[
                ("/test\\path/to\\file", "/test/path/to/file"),
                ("\\test/path\\to/file\\", "/test/path/to/file/"),
            ],
        )
    }

    #[test]
    fn should_handle_empty_segments() -> Result<()> {
        check_all(
            false,
            &[
                ("/a//b", "/a/b"),
                ("/a///b", "/a/b"),
                ("///a///b///", "/a/b/"),
            ],
        )
    }

    #[test]
    fn should_preserve_case() -> Result<()> {
        check_all(
            false,
            &[("/TeSt/PaTh", "/TeSt/PaTh"), ("/TEST/path", "/TEST/path")],
        )
    }

    #[test]
    fn should_handle_hex_boundary_values() -> Result<()> {
        check_all(
            false,
            &[
                ("/test%00", "/test"),
                ("/test%01", "/test%01"),
                ("/test%7F", "/test%7F"),
                ("/test%80", "/test%80"),
            ],
        )
    }

    #[test]
    fn should_handle_realistic_paths() -> Result<()> {
        check_all(
            false,
            &[
                ("/api/v1/users/123", "/api/v1/users/123"),
                ("/static/css/main.css", "/static/css/main.css"),
                ("/uploads/2023/12/image.jpg", "/uploads/2023/12/image.jpg"),
                ("/docs/README.md", "/docs/README.md"),
            ],
        )
    }

    #[test]
    fn should_prevent_directory_traversal_attacks() -> Result<()> {
        check_all(
            false,
            &[
                ("/../../../etc/passwd", "/etc/passwd"),
                ("/app/../../../etc/passwd", "/etc/passwd"),
                ("/safe/path/../../../../../../etc/passwd", "/etc/passwd"),
            ],
        )
    }

    #[test]
    fn should_handle_complex_combinations() -> Result<()> {
        check_all(
            false,
            &[
                ("/test%00/../path%3C%3E%7C", "/path%3C%3E%7C"),
                ("test\\..\\path%21\\file.txt", "/path!/file.txt"),
            ],
        )
    }
}