regorus/
lexer.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4// SAFETY: Arithmetic operations in this module are safe by design:
5// 1. MAX_COL=1024 prevents column counter overflow (enforced by advance_col)
6// 2. File size is capped by MAX_FILE_BYTES at load time
7// 3. Total line count is capped by MAX_LINES at load time
8// 4. State-modifying operations (advance_col/advance_line) use checked arithmetic
9// 5. Remaining arithmetic is for bounded calculations (spans, error reporting)
10//    where operands are constrained by MAX_COL and file size/line limits
11// 6. Defensive saturating_sub used for subtractions that could theoretically underflow
12use crate::*;
13use core::cmp;
14use core::fmt::{self, Debug, Formatter};
15use core::iter::Peekable;
16use core::ops::Range;
17use core::str::CharIndices;
18
19use crate::Value;
20
21use anyhow::{anyhow, bail, Result};
22
23#[inline]
24fn check_memory_limit() -> Result<()> {
25    crate::utils::limits::check_memory_limit_if_needed().map_err(|err| anyhow!(err))
26}
27
/// Maximum column width to prevent overflow and catch pathological input.
/// Lines exceeding this are likely minified/generated code or attack attempts.
/// Enforced by `Lexer::advance_col`.
const MAX_COL: u32 = 1024;
/// Maximum allowed policy file size in bytes (1 MiB) to reject pathological inputs early.
/// Checked at the top of `Source::from_contents`.
const MAX_FILE_BYTES: usize = 1_048_576;
/// Maximum allowed number of lines to avoid pathological or minified inputs.
/// Checked in `Source::from_contents` before each line span is recorded.
const MAX_LINES: usize = 20_000;
35
36#[inline]
37fn usize_to_u32(value: usize) -> Result<u32> {
38    u32::try_from(value).map_err(|_| anyhow!("value exceeds u32::MAX"))
39}
40
/// Converts a `(start, end)` pair of `u32` byte offsets into a `usize` range.
///
/// Returns `None` if either bound cannot be represented as `usize`. The bounds
/// are not checked against each other; an inverted pair yields an inverted range.
#[inline]
fn span_range(start: u32, end: u32) -> Option<Range<usize>> {
    match (usize::try_from(start), usize::try_from(end)) {
        (Ok(lo), Ok(hi)) => Some(lo..hi),
        _ => None,
    }
}
47
/// Shared payload behind a [`Source`] handle: the file name, its full text,
/// and a per-line index into that text.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
struct SourceInternal {
    /// Name or path the contents were loaded under.
    pub file: String,
    /// Complete text of the policy file.
    pub contents: String,
    /// `(start, end)` byte offsets into `contents`, one entry per line, as
    /// built by `Source::from_contents`. Not serialized.
    #[cfg_attr(feature = "ast", serde(skip_serializing))]
    pub lines: Vec<(u32, u32)>,
}
56
/// A policy file.
///
/// This is a reference-counted handle to the shared file data, so cloning it
/// does not copy the contents. Equality, ordering and hashing of `Source`
/// values are based on the identity of that shared allocation, not on the text.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
pub struct Source {
    // Shared file name, contents and line index.
    #[cfg_attr(feature = "ast", serde(flatten))]
    src: Rc<SourceInternal>,
}
64
65impl Source {
66    /// The path associated with the policy file.
67    pub fn get_path(&self) -> &String {
68        &self.src.file
69    }
70
71    /// The contents of the policy file.
72    pub fn get_contents(&self) -> &String {
73        &self.src.contents
74    }
75}
76
77impl cmp::Ord for Source {
78    fn cmp(&self, other: &Source) -> cmp::Ordering {
79        Rc::as_ptr(&self.src).cmp(&Rc::as_ptr(&other.src))
80    }
81}
82
83impl cmp::PartialOrd for Source {
84    fn partial_cmp(&self, other: &Source) -> Option<cmp::Ordering> {
85        Some(self.cmp(other))
86    }
87}
88
89impl cmp::PartialEq for Source {
90    fn eq(&self, other: &Source) -> bool {
91        Rc::as_ptr(&self.src) == Rc::as_ptr(&other.src)
92    }
93}
94
// Pointer-identity equality is reflexive, symmetric and transitive, so `Eq` holds.
impl cmp::Eq for Source {}
96
#[cfg(feature = "std")]
impl core::hash::Hash for Source {
    /// Hashes the address of the shared allocation, matching the
    /// pointer-based `PartialEq` implementation.
    fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
        let identity = Rc::as_ptr(&self.src);
        core::hash::Hash::hash(&identity, state);
    }
}
103
104impl Debug for Source {
105    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
106        self.src.file.fmt(f)
107    }
108}
109
/// A slice of a [`Source`]'s contents, stored as the source handle plus a
/// byte-offset range rather than as a borrowed `&str`.
///
/// Comparison, ordering and formatting all operate on the referenced text
/// (see `text`), not on the offsets or on which source it came from.
#[derive(Clone)]
pub struct SourceStr {
    // Source whose contents the offsets index into.
    source: Source,
    // Byte offset where the slice starts.
    start: u32,
    // Byte offset one past the end of the slice.
    end: u32,
}
116
117impl Debug for SourceStr {
118    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
119        self.text().fmt(f)
120    }
121}
122
123impl fmt::Display for SourceStr {
124    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
125        fmt::Display::fmt(&self.text(), f)
126    }
127}
128
129impl SourceStr {
130    pub const fn new(source: Source, start: u32, end: u32) -> Self {
131        Self { source, start, end }
132    }
133
134    pub fn text(&self) -> &str {
135        // Use safe slicing to avoid panics on malformed spans
136        span_range(self.start, self.end).map_or("<invalid-span>", |range| {
137            self.source
138                .contents()
139                .get(range)
140                .unwrap_or("<invalid-span>")
141        })
142    }
143
144    pub fn clone_empty(&self) -> SourceStr {
145        Self {
146            source: self.source.clone(),
147            start: 0,
148            end: 0,
149        }
150    }
151}
152
153impl cmp::PartialEq for SourceStr {
154    fn eq(&self, other: &Self) -> bool {
155        self.text().eq(other.text())
156    }
157}
158
// Equality is plain string equality on the referenced text, which is a full equivalence.
impl cmp::Eq for SourceStr {}
160
161impl cmp::PartialOrd for SourceStr {
162    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
163        Some(self.cmp(other))
164    }
165}
166
167impl cmp::Ord for SourceStr {
168    fn cmp(&self, other: &Self) -> cmp::Ordering {
169        self.text().cmp(other.text())
170    }
171}
172
173impl Source {
174    pub fn from_contents(file: String, contents: String) -> Result<Source> {
175        if contents.len() > MAX_FILE_BYTES {
176            bail!("{file} exceeds maximum allowed policy file size {MAX_FILE_BYTES} bytes");
177        }
178        let mut lines = vec![];
179        let mut prev_ch = ' ';
180        let mut prev_pos = 0_u32;
181        let mut start = 0_u32;
182        for (i, ch) in contents.char_indices() {
183            let i_u32 = usize_to_u32(i)?;
184            if ch == '\n' {
185                let end = match prev_ch {
186                    '\r' => prev_pos,
187                    _ => i_u32,
188                };
189                if lines.len() >= MAX_LINES {
190                    bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
191                }
192                lines.push((start, end));
193                // Enforce the current global memory cap after recording each line span.
194                check_memory_limit()?;
195                start = i_u32.saturating_add(1);
196            }
197            prev_ch = ch;
198            prev_pos = i_u32;
199        }
200
201        let start_usize = usize::try_from(start).unwrap_or(usize::MAX);
202        if start_usize < contents.len() {
203            if lines.len() >= MAX_LINES {
204                bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
205            }
206            lines.push((start, usize_to_u32(contents.len())?));
207            // Enforce the global limit after appending the final line span.
208            check_memory_limit()?;
209        } else if contents.is_empty() {
210            lines.push((0, 0));
211            // Enforce the global limit even for empty sources.
212            check_memory_limit()?;
213        } else {
214            let s = usize_to_u32(contents.len().saturating_sub(1))?;
215            if lines.len() >= MAX_LINES {
216                bail!("{file} exceeds maximum allowed line count {MAX_LINES}");
217            }
218            lines.push((s, s));
219            // Enforce the global limit after storing the trailing span.
220            check_memory_limit()?;
221        }
222        Ok(Self {
223            src: Rc::new(SourceInternal {
224                file,
225                contents,
226                lines,
227            }),
228        })
229    }
230
231    #[cfg(feature = "std")]
232    pub fn from_file<P: AsRef<std::path::Path>>(path: P) -> Result<Source> {
233        let contents = match std::fs::read_to_string(&path) {
234            Ok(c) => c,
235            Err(e) => bail!("Failed to read {}. {e}", path.as_ref().display()),
236        };
237        // TODO: retain path instead of converting to string
238        Self::from_contents(path.as_ref().to_string_lossy().to_string(), contents)
239    }
240
241    pub fn file(&self) -> &String {
242        &self.src.file
243    }
244    pub fn contents(&self) -> &String {
245        &self.src.contents
246    }
247    pub fn line(&self, idx: u32) -> &str {
248        let idx = usize::try_from(idx).unwrap_or(usize::MAX);
249        match self.src.lines.get(idx) {
250            Some(&(start, end)) => self
251                .src
252                .contents
253                .get(span_range(start, end).unwrap_or(0..0))
254                .unwrap_or(""),
255            None => "",
256        }
257    }
258
259    pub fn message(&self, line: u32, col: u32, kind: &str, msg: &str) -> String {
260        if usize::try_from(line).unwrap_or(usize::MAX) > self.src.lines.len() {
261            return format!("{}: invalid line {} specified", self.src.file, line);
262        }
263
264        let line_str = format!("{line}");
265        let line_num_width = line_str.len().saturating_add(1);
266        let col_spaces = usize::try_from(col).unwrap_or(0).saturating_sub(1);
267
268        format!(
269            "\n--> {}:{}:{}\n{:<line_num_width$}|\n\
270		{:<line_num_width$}| {}\n\
271		{:<line_num_width$}| {:<col_spaces$}^\n\
272		{}: {}",
273            self.src.file,
274            line,
275            col,
276            "",
277            line,
278            self.line(line.saturating_sub(1)),
279            "",
280            "",
281            kind,
282            msg
283        )
284    }
285
286    pub fn error(&self, line: u32, col: u32, msg: &str) -> anyhow::Error {
287        anyhow!(self.message(line, col, "error", msg))
288    }
289}
290
/// A located region of a [`Source`]: a byte range plus the line and column
/// used when reporting diagnostics for it.
#[derive(Clone)]
#[cfg_attr(feature = "ast", derive(serde::Serialize))]
pub struct Span {
    /// Source the offsets refer to. Not serialized.
    #[cfg_attr(feature = "ast", serde(skip_serializing))]
    pub source: Source,
    /// Line number passed to `Source::message`; the lexer starts counting at 1.
    pub line: u32,
    /// Column number passed to `Source::message`; the lexer starts counting at 1.
    pub col: u32,
    /// Byte offset into the source contents where the span starts.
    pub start: u32,
    /// Byte offset into the source contents one past the end of the span.
    pub end: u32,
}
301
302impl Span {
303    pub fn text(&self) -> &str {
304        // Use safe slicing to avoid panics on malformed spans
305        span_range(self.start, self.end).map_or("<invalid-span>", |range| {
306            self.source
307                .contents()
308                .get(range)
309                .unwrap_or("<invalid-span>")
310        })
311    }
312
313    pub fn source_str(&self) -> SourceStr {
314        SourceStr::new(self.source.clone(), self.start, self.end)
315    }
316
317    pub fn message(&self, kind: &str, msg: &str) -> String {
318        self.source.message(self.line, self.col, kind, msg)
319    }
320
321    pub fn error(&self, msg: &str) -> anyhow::Error {
322        self.source.error(self.line, self.col, msg)
323    }
324}
325
326impl Debug for Span {
327    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
328        let t = self.text().escape_debug().to_string();
329        let max = 32;
330        let (txt, trailer) = if t.len() > max {
331            (&t[0..max], "...")
332        } else {
333            (t.as_str(), "")
334        };
335
336        f.write_fmt(format_args!(
337            "{}:{}:{}:{}, \"{}{}\"",
338            self.line, self.col, self.start, self.end, txt, trailer
339        ))
340    }
341}
342
/// Token kinds that exist only when the `azure-rbac` feature is enabled.
/// They are wrapped in `TokenKind::AzureRbac`.
#[cfg(feature = "azure-rbac")]
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum AzureRbacTokenKind {
    At,         // @ symbol for attribute sources (@Request, @Resource, etc.)
    LogicalAnd, // && operator
    LogicalOr,  // || operator
}
350
/// Classification of a lexed token.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum TokenKind {
    /// Punctuation or operator such as `{`, `+`, `:=` or `<=`.
    Symbol,
    /// Quoted string; the token's span covers the text between the quotes.
    String,
    /// Backtick-delimited raw string; the span covers the text between the backticks.
    RawString,
    /// Numeric literal that `serde_json` was able to parse.
    Number,
    /// Identifier made of ASCII letters, digits and `_`.
    Ident,
    /// NOTE(review): presumably emitted at end of input — the producing code is outside this view.
    Eof,
    // Azure RBAC-specific tokens
    #[cfg(feature = "azure-rbac")]
    AzureRbac(AzureRbacTokenKind),
}
363
/// A lexed token: its kind (field `0`) and the span it covers in the source (field `1`).
#[derive(Debug, Clone)]
pub struct Token(pub TokenKind, pub Span);
366
/// Tokenizer over the contents of a [`Source`].
///
/// Tracks the current line and column while consuming characters, and carries
/// a set of flags that adjust which constructs are recognized.
#[derive(Clone)]
pub struct Lexer<'source> {
    // Handle to the source being lexed; used for spans and error messages.
    source: Source,
    // Cursor over (byte offset, char) pairs of the source contents.
    iter: Peekable<CharIndices<'source>>,
    // Current line number; starts at 1.
    line: u32,
    // Current column number; starts at 1 and is capped at MAX_COL by advance_col.
    col: u32,
    // NOTE(review): presumably makes unrecognized characters lex as symbols
    // instead of errors — the code that reads this flag is outside this view.
    unknown_char_is_symbol: bool,
    // When set, `\*` is accepted as an escape while scanning double-quoted strings.
    allow_slash_star_escape: bool,
    // When set, comments start with `//`; otherwise they start with `#`.
    comment_starts_with_double_slash: bool,
    // When set, `::` is lexed as a single symbol.
    double_colon_token: bool,
    // When set, `&&` and `||` are lexed as Azure RBAC logical operator tokens.
    #[cfg(feature = "azure-rbac")]
    enable_rbac_tokens: bool,
    // NOTE(review): presumably enables dispatch to read_single_quoted_string —
    // the code that reads this flag is outside this view.
    #[cfg(feature = "azure-rbac")]
    allow_single_quoted_strings: bool,
}
382
383impl<'source> fmt::Debug for Lexer<'source> {
384    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
385        f.debug_struct("Lexer").finish_non_exhaustive()
386    }
387}
388
389impl<'source> Lexer<'source> {
390    pub fn new(source: &'source Source) -> Self {
391        Self {
392            source: source.clone(),
393            iter: source.contents().char_indices().peekable(),
394            line: 1,
395            col: 1,
396            unknown_char_is_symbol: false,
397            allow_slash_star_escape: false,
398            comment_starts_with_double_slash: false,
399            double_colon_token: false,
400            #[cfg(feature = "azure-rbac")]
401            enable_rbac_tokens: false,
402            #[cfg(feature = "azure-rbac")]
403            allow_single_quoted_strings: false,
404        }
405    }
406
    /// Sets the `unknown_char_is_symbol` flag.
    ///
    /// NOTE(review): presumably controls whether unrecognized characters are
    /// lexed as symbols — the code that reads the flag is outside this view.
    pub const fn set_unknown_char_is_symbol(&mut self, b: bool) {
        self.unknown_char_is_symbol = b;
    }

    /// When enabled, `\*` is accepted as an escape while scanning
    /// double-quoted strings.
    pub const fn set_allow_slash_star_escape(&mut self, b: bool) {
        self.allow_slash_star_escape = b;
    }

    /// When enabled, comments start with `//`; when disabled (the default),
    /// they start with `#`.
    pub const fn set_comment_starts_with_double_slash(&mut self, b: bool) {
        self.comment_starts_with_double_slash = b;
    }

    /// When enabled, `::` is lexed as a single symbol token.
    pub const fn set_double_colon_token(&mut self, b: bool) {
        self.double_colon_token = b;
    }

    /// When enabled, `&&` and `||` are lexed as Azure RBAC logical operator
    /// tokens instead of individual `&` / `|` symbols.
    #[cfg(feature = "azure-rbac")]
    pub const fn set_enable_rbac_tokens(&mut self, b: bool) {
        self.enable_rbac_tokens = b;
    }

    /// Sets the `allow_single_quoted_strings` flag.
    ///
    /// NOTE(review): presumably enables `'...'` string literals — the code
    /// that reads the flag is outside this view.
    #[cfg(feature = "azure-rbac")]
    pub const fn set_allow_single_quoted_strings(&mut self, b: bool) {
        self.allow_single_quoted_strings = b;
    }
432
433    fn peek(&mut self) -> (usize, char) {
434        match self.iter.peek() {
435            Some(&(index, chr)) => (index, chr),
436            _ => (self.source.contents().len(), '\x00'),
437        }
438    }
439
440    #[inline]
441    fn advance_col(&mut self, delta: u32) -> Result<()> {
442        let new_col = self
443            .col
444            .checked_add(delta)
445            .filter(|&c| c <= MAX_COL)
446            .ok_or_else(|| {
447                self.source.error(
448                    self.line,
449                    self.col,
450                    &format!("line exceeds maximum column width of {MAX_COL}"),
451                )
452            })?;
453        self.col = new_col;
454        Ok(())
455    }
456
457    #[inline]
458    fn advance_line(&mut self, delta: u32) -> Result<()> {
459        self.line = self.line.checked_add(delta).ok_or_else(|| {
460            self.source
461                .error(self.line, self.col, "line number overflow")
462        })?;
463        Ok(())
464    }
465
466    fn peekahead(&mut self, n: usize) -> (usize, char) {
467        match self.iter.clone().nth(n) {
468            Some((index, chr)) => (index, chr),
469            _ => (self.source.contents().len(), '\x00'),
470        }
471    }
472
473    fn read_ident(&mut self) -> Result<Token> {
474        let start = self.peek().0;
475        let col = self.col;
476        loop {
477            let ch = self.peek().1;
478            if ch.is_ascii_alphanumeric() || ch == '_' {
479                self.iter.next();
480            } else {
481                break;
482            }
483        }
484        let end = self.peek().0;
485        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;
486        Ok(Token(
487            TokenKind::Ident,
488            Span {
489                source: self.source.clone(),
490                line: self.line,
491                col,
492                start: usize_to_u32(start)?,
493                end: usize_to_u32(end)?,
494            },
495        ))
496    }
497
498    fn read_digits(&mut self) {
499        while self.peek().1.is_ascii_digit() {
500            self.iter.next();
501        }
502    }
503
    /// Lexes a JSON-style number starting at the cursor.
    ///
    /// The first character is consumed unconditionally, then an integer part,
    /// an optional fraction and an optional exponent are scanned. The scanned
    /// text is validated by parsing it with `serde_json`; the parsed value
    /// itself is discarded and only the span is returned.
    ///
    /// # Errors
    /// Fails if the number is directly followed by `_`, `.` or an ASCII
    /// alphanumeric character, if `serde_json` rejects the text, if the
    /// column limit is exceeded, or if the memory limit check fails.
    // See https://www.json.org/json-en.html for number's grammar
    fn read_number(&mut self) -> Result<Token> {
        let (start, chr) = self.peek();
        let col = self.col;
        self.iter.next();

        // Read integer part.
        if chr != '0' {
            // The first character was not `0`, so more integer digits may follow.
            self.read_digits();
        }

        // Read fraction part
        // . must be followed by at least 1 digit.
        if self.peek().1 == '.' && self.peekahead(1).1.is_ascii_digit() {
            self.iter.next(); // .
            self.read_digits();
        }

        // Read exponent part
        let exp_ch = self.peek().1;
        if exp_ch == 'e' || exp_ch == 'E' {
            self.iter.next();
            // e must be followed by an optional sign and digits
            if matches!(self.peek().1, '+' | '-') {
                self.iter.next();
            }
            // Read digits. Absence of digit will be validated by serde later.
            self.read_digits();
        }

        let end = self.peek().0;
        // The column advances by the byte length of the scanned text.
        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;

        // Check for an invalid number. A valid number cannot be followed by
        // these characters:
        let trailing_ch = self.peek().1;
        if trailing_ch == '_' || trailing_ch == '.' || trailing_ch.is_ascii_alphanumeric() {
            return Err(self.source.error(self.line, self.col, "invalid number"));
        }

        // Ensure that the number is parsable in Rust.
        let num_slice = self
            .source
            .contents()
            .get(start..end)
            .ok_or_else(|| self.source.error(self.line, col, "invalid number span"))?;

        let parsed_number = match serde_json::from_str::<Value>(num_slice) {
            Ok(value) => value,
            Err(e) => {
                // Reduce well-known serde_json messages to a short fixed
                // phrase; any other message is passed through unchanged.
                let serde_msg = &e.to_string();
                let msg = match &serde_msg {
                    m if m.contains("out of range") => "out of range",
                    m if m.contains("invalid number") => "invalid number",
                    m if m.contains("expected value") => "expected value",
                    m if m.contains("trailing characters") => "trailing characters",
                    m => m.to_owned(),
                };

                bail!(
                    "{} {}",
                    self.source.error(
                        self.line,
                        col,
                        "invalid number. serde_json cannot parse number:"
                    ),
                    msg
                )
            }
        };

        // Enforce the global memory limit after serde allocates the temporary Value.
        check_memory_limit()?;
        drop(parsed_number);

        Ok(Token(
            TokenKind::Number,
            Span {
                source: self.source.clone(),
                line: self.line,
                col,
                start: usize_to_u32(start)?,
                end: usize_to_u32(end)?,
            },
        ))
    }
591
592    fn read_raw_string(&mut self) -> Result<Token> {
593        self.iter.next();
594        self.advance_col(1)?;
595        let (start, _) = self.peek();
596        let (line, col) = (self.line, self.col);
597        loop {
598            let (_, ch) = self.peek();
599            self.iter.next();
600            match ch {
601                '`' => {
602                    self.advance_col(1)?;
603                    break;
604                }
605                '\x00' => {
606                    return Err(self.source.error(line, col, "unmatched `"));
607                }
608                '\t' => self.advance_col(4)?,
609                '\n' => {
610                    self.advance_line(1)?;
611                    self.col = 1;
612                }
613                _ => self.advance_col(1)?,
614            }
615        }
616        let end = self.peek().0;
617        if end <= start {
618            // Guard against invalid span that would underflow end - 1
619            return Err(self.source.error(line, col, "invalid raw string span"));
620        }
621        check_memory_limit()?;
622        Ok(Token(
623            TokenKind::RawString,
624            Span {
625                source: self.source.clone(),
626                line,
627                col,
628                start: usize_to_u32(start)?,
629                end: usize_to_u32(end)?.saturating_sub(1),
630            },
631        ))
632    }
633
    /// Lexes a double-quoted string; the cursor must be on the opening quote.
    ///
    /// The body is scanned for valid escape sequences and characters, then the
    /// quoted text (including both quotes) is validated by parsing it with
    /// `serde_json`. The returned span covers only the text between the quotes
    /// and its column is the column just after the opening quote.
    ///
    /// # Errors
    /// Fails on an invalid escape sequence, a non-hex digit in a `\u` escape,
    /// a character below U+0020, a missing closing quote, text that
    /// `serde_json` rejects, a column limit violation, or a failed memory
    /// limit check.
    fn read_string(&mut self) -> Result<Token> {
        // Position of the opening quote, used for "unmatched" style errors.
        let (line, col) = (self.line, self.col);
        self.iter.next();
        self.advance_col(1)?;
        // Byte offset of the first character after the opening quote.
        let (start, _) = self.peek();
        loop {
            let (offset, ch) = self.peek();
            match ch {
                // Closing quote or end of input; told apart after the loop.
                '"' | '\x00' => {
                    break;
                }
                '\\' => {
                    self.iter.next();
                    let (_, escape_ch) = self.peek();
                    self.iter.next();
                    match escape_ch {
                        // json escape sequence
                        '"' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't' => (),
                        // NOTE(review): `\*` passes this scan when the flag is set, but the
                        // raw slice is still handed to serde_json below — confirm that
                        // validation accepts it.
                        '*' if self.allow_slash_star_escape => (),
                        'u' => {
                            // \u must be followed by exactly four hex digits.
                            for _i in 0..4 {
                                let (hex_offset, hex_ch) = self.peek();
                                let rel = usize_to_u32(hex_offset.saturating_sub(start))?;
                                let cursor_col = self.col.saturating_add(rel);
                                if !hex_ch.is_ascii_hexdigit() {
                                    return Err(self.source.error(
                                        line,
                                        cursor_col,
                                        "invalid hex escape sequence",
                                    ));
                                }
                                self.iter.next();
                            }
                        }
                        _ => {
                            // Report at the backslash that started the escape.
                            let cursor_col = self
                                .col
                                .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
                            return Err(self.source.error(
                                line,
                                cursor_col,
                                "invalid escape sequence",
                            ));
                        }
                    }
                }
                _ => {
                    // check for valid json chars
                    let cursor_col = self
                        .col
                        .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
                    if !('\u{0020}'..='\u{10FFFF}').contains(&ch) {
                        return Err(self.source.error(
                            line,
                            cursor_col,
                            "invalid character in string",
                        ));
                    }
                    self.iter.next();
                }
            }
        }

        // The loop also stops at end of input; that means the quote was never closed.
        if self.peek().1 != '"' {
            return Err(self.source.error(line, col, "unmatched \""));
        }

        self.iter.next();
        // Byte offset just past the closing quote.
        let end = self.peek().0;
        // The column advances by the byte length of the body plus the closing quote.
        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;

        if start == 0 || end <= start {
            // Reject invalid spans before slicing/serde to avoid panic
            return Err(self.source.error(line, col, "invalid string span"));
        }

        // Slice from the opening quote through the closing quote.
        let str_slice = self
            .source
            .contents()
            .get(start.saturating_sub(1)..end)
            .ok_or_else(|| self.source.error(line, col, "invalid string span"))?;

        // Ensure that the string is parsable in Rust.
        match serde_json::from_str::<String>(str_slice) {
            Ok(_) => (),
            Err(e) => {
                let serde_msg = &e.to_string();
                let msg = serde_msg;
                bail!(
                    "{} {}",
                    self.source
                        .error(self.line, col, "serde_json cannot parse string:"),
                    msg
                )
            }
        }

        check_memory_limit()?;

        Ok(Token(
            TokenKind::String,
            Span {
                source: self.source.clone(),
                line,
                col: col.saturating_add(1),
                start: usize_to_u32(start)?,
                // Step back over the closing quote so the span excludes it.
                end: usize_to_u32(end)?.saturating_sub(1),
            },
        ))
    }
744
745    #[cfg(feature = "azure-rbac")]
746    fn read_single_quoted_string(&mut self) -> Result<Token> {
747        let (line, col) = (self.line, self.col);
748        self.iter.next();
749        self.advance_col(1)?;
750        let (start, _) = self.peek();
751        loop {
752            let (offset, ch) = self.peek();
753            let cursor_col = self
754                .col
755                .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
756            match ch {
757                '\'' | '\x00' => {
758                    break;
759                }
760                '\\' => {
761                    self.iter.next();
762                    let (_, escape_ch) = self.peek();
763                    self.iter.next();
764                    match escape_ch {
765                        // Basic escape sequences for single-quoted strings
766                        '\'' | '\\' | 'n' | 'r' | 't' => (),
767                        _ => {
768                            return Err(self.source.error(
769                                line,
770                                cursor_col,
771                                "invalid escape sequence",
772                            ))
773                        }
774                    }
775                }
776                _ => {
777                    // check for valid chars
778                    let inner_cursor_col = self
779                        .col
780                        .saturating_add(usize_to_u32(offset.saturating_sub(start))?);
781                    if !('\u{0020}'..='\u{10FFFF}').contains(&ch) {
782                        return Err(self.source.error(
783                            line,
784                            inner_cursor_col,
785                            "invalid character in string",
786                        ));
787                    }
788                    self.iter.next();
789                }
790            }
791        }
792
793        if self.peek().1 != '\'' {
794            return Err(self.source.error(line, col, "unmatched '"));
795        }
796
797        self.iter.next();
798        let end = self.peek().0;
799        self.advance_col(usize_to_u32(end.saturating_sub(start))?)?;
800
801        check_memory_limit()?;
802
803        Ok(Token(
804            TokenKind::String,
805            Span {
806                source: self.source.clone(),
807                line,
808                col: col.saturating_add(1),
809                start: usize_to_u32(start)?,
810                end: usize_to_u32(end)?.saturating_sub(1),
811            },
812        ))
813    }
814
815    #[inline]
816    fn skip_past_newline(&mut self) -> Result<()> {
817        self.iter.next();
818        loop {
819            match self.peek().1 {
820                '\n' | '\x00' => break,
821                _ => self.iter.next(),
822            };
823        }
824        Ok(())
825    }
826
827    fn skip_ws(&mut self) -> Result<()> {
828        // Only the 4 json whitespace characters are recognized.
829        // https://www.crockford.com/mckeeman.html.
830        // Additionally, comments are also skipped.
831        // A tab is considered 4 space characters.
832        loop {
833            match self.peek().1 {
834                ' ' => self.advance_col(1)?,
835                '\t' => self.advance_col(4)?,
836                '\r' => {
837                    if self.peekahead(1).1 != '\n' {
838                        return Err(self.source.error(
839                            self.line,
840                            self.col,
841                            "\\r must be followed by \\n",
842                        ));
843                    }
844                }
845                '\n' => {
846                    self.col = 1;
847                    self.advance_line(1)?;
848                }
849                '#' if !self.comment_starts_with_double_slash => {
850                    self.skip_past_newline()?;
851                    continue;
852                }
853                '/' if self.comment_starts_with_double_slash && self.peekahead(1).1 == '/' => {
854                    self.skip_past_newline()?;
855                    continue;
856                }
857                _ => break,
858            }
859            self.iter.next();
860        }
861        Ok(())
862    }
863
864    pub fn next_token(&mut self) -> Result<Token> {
865        self.skip_ws()?;
866
867        let (start, chr) = self.peek();
868        let start_u32 = usize_to_u32(start)?;
869        let col = self.col;
870
871        let token = match chr {
872            // Special case for - followed by digit which is a
873            // negative json number.
874            // . followed by digit is invalid number.
875            '-' | '.' if self.peekahead(1).1.is_ascii_digit() => self.read_number()?,
876            // grouping characters
877            '{' | '}' | '[' | ']' | '(' | ')' |
878            // arith operator
879            '+' | '-' | '*' | '/' | '%' |
880            // separators
881            ',' | ';' | '.' => {
882                self.advance_col(1)?;
883                self.iter.next();
884                Token(TokenKind::Symbol, Span {
885                    source: self.source.clone(),
886                    line: self.line,
887                    col,
888                    start: start_u32,
889                    end: start_u32.saturating_add(1),
890                })
891            }
892            #[cfg(feature = "azure-rbac")]
893            // RBAC logical AND operator (&&)
894            '&' if self.enable_rbac_tokens && self.peekahead(1).1 == '&' => {
895                self.advance_col(2)?;
896                self.iter.next();
897                self.iter.next();
898                Token(TokenKind::AzureRbac(AzureRbacTokenKind::LogicalAnd), Span {
899                    source: self.source.clone(),
900                    line: self.line,
901                    col,
902                    start: start_u32,
903                    end: start_u32.saturating_add(2),
904                })
905            }
906            #[cfg(feature = "azure-rbac")]
907            // RBAC logical OR operator (||)
908            '|' if self.enable_rbac_tokens && self.peekahead(1).1 == '|' => {
909                self.advance_col(2)?;
910                self.iter.next();
911                self.iter.next();
912                Token(TokenKind::AzureRbac(AzureRbacTokenKind::LogicalOr), Span {
913                    source: self.source.clone(),
914                    line: self.line,
915                    col,
916                    start: start_u32,
917                    end: start_u32.saturating_add(2),
918                })
919            }
920            // Generic bin operators (when RBAC tokens not enabled or single & |)
921            '&' | '|' => {
922                self.advance_col(1)?;
923                self.iter.next();
924                Token(TokenKind::Symbol, Span {
925                    source: self.source.clone(),
926                    line: self.line,
927                    col,
928                    start: start_u32,
929                    end: start_u32.saturating_add(1),
930                })
931            }
932            ':' => {
933                self.advance_col(1)?;
934                self.iter.next();
935                let mut end = start_u32.saturating_add(1);
936                if self.peek().1 == '=' || (self.peek().1 == ':' && self.double_colon_token) {
937                    self.advance_col(1)?;
938                    self.iter.next();
939                    end = end.saturating_add(1);
940                }
941                Token(TokenKind::Symbol, Span {
942                    source: self.source.clone(),
943                    line: self.line,
944                    col,
945                    start: start_u32,
946                    end,
947                })
948            }
949            // < <= > >= = ==
950            '<' | '>' | '=' => {
951                self.advance_col(1)?;
952                self.iter.next();
953                if self.peek().1 == '=' {
954                    self.advance_col(1)?;
955                    self.iter.next();
956                };
957                Token(TokenKind::Symbol, Span {
958                    source: self.source.clone(),
959                    line: self.line,
960                    col,
961                    start: start_u32,
962                    end: usize_to_u32(self.peek().0)?,
963                })
964            }
965            '!' if self.peekahead(1).1 == '=' => {
966                self.advance_col(2)?;
967                self.iter.next();
968                self.iter.next();
969                Token(TokenKind::Symbol, Span {
970                    source: self.source.clone(),
971                    line: self.line,
972                    col,
973                    start: start_u32,
974                    end: usize_to_u32(self.peek().0)?,
975                })
976            }
977            #[cfg(feature = "azure-rbac")]
978            // RBAC @ token for attribute references
979            '@' if self.enable_rbac_tokens => {
980                self.advance_col(1)?;
981                self.iter.next();
982                Token(TokenKind::AzureRbac(AzureRbacTokenKind::At), Span {
983                    source: self.source.clone(),
984                    line: self.line,
985                    col,
986                    start: start_u32,
987                    end: start_u32.saturating_add(1),
988                })
989            }
990            '"' => self.read_string()?,
991            #[cfg(feature = "azure-rbac")]
992            '\'' if self.allow_single_quoted_strings => self.read_single_quoted_string()?,
993            '`' => self.read_raw_string()?,
994            '\x00' => Token(TokenKind::Eof, Span {
995                source: self.source.clone(),
996                line: self.line,
997                col,
998                start: start_u32,
999                end: start_u32,
1000            }),
1001            _ if chr.is_ascii_digit() => self.read_number()?,
1002            _ if chr.is_ascii_alphabetic() || chr == '_' => {
1003                let mut ident = self.read_ident()?;
1004                if ident.1.text() == "set" && self.peek().1 == '(' {
1005                    // set immediately followed by ( is treated as set( if
1006                    // the next token is ).
1007                    let state = (self.iter.clone(), self.line, self.col);
1008                    self.iter.next();
1009
1010                    // Check if the next token is ).
1011                    let next_tok = self.next_token()?;
1012                    let is_setp = next_tok.1.text() == ")";
1013
1014                    // Restore state
1015                    (self.iter, self.line, self.col) = state;
1016
1017                    if is_setp {
1018                        self.iter.next();
1019                        self.advance_col(1)?;
1020                        ident.1.end = ident.1.end.saturating_add(1);
1021                    }
1022                }
1023                ident
1024            }
1025            _ if self.unknown_char_is_symbol => {
1026                self.advance_col(1)?;
1027                self.iter.next();
1028                Token(TokenKind::Symbol, Span {
1029                    source: self.source.clone(),
1030                    line: self.line,
1031                    col,
1032                    start: start_u32,
1033                    end: start_u32.saturating_add(1),
1034                })
1035            }
1036            _ => return Err(self.source.error(self.line, self.col, "invalid character")),
1037        };
1038
1039        check_memory_limit()?;
1040        Ok(token)
1041    }
1042}