Add follow json path in jsonb

2025-08-04 18:18:03 +00:00 · 2025-03-15 22:00:12 +02:00 · 2025-03-15 22:00:12 +02:00 · 353422a25c
commit 353422a25c
parent 9efcacc601
1 changed files with 246 additions and 5 deletions
--- a/core/json/jsonb.rs
+++ b/core/json/jsonb.rs
@ -1,6 +1,8 @@
 use crate::{bail_parse_error, LimboError, Result};
 use std::{fmt::Write, str::from_utf8};

+use super::json_path::{JsonPath, PathElement};
+
 const SIZE_MARKER_8BIT: u8 = 12;
 const SIZE_MARKER_16BIT: u8 = 13;
 const SIZE_MARKER_32BIT: u8 = 14;
@ -218,7 +220,7 @@ impl TryFrom<u8> for ElementType {

 type PayloadSize = usize;

-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct JsonbHeader(ElementType, PayloadSize);

 enum HeaderFormat {
@ -402,7 +404,7 @@ impl Jsonb {
            | JsonbHeader(ElementType::TEXTRAW, len)
            | JsonbHeader(ElementType::TEXTJ, len)
            | JsonbHeader(ElementType::TEXT5, len) => {
-                self.serialize_string(string, cursor, len, &header.0)?
+                self.serialize_string(string, cursor, len, &header.0, true)?
            }
            JsonbHeader(ElementType::INT, len)
            | JsonbHeader(ElementType::INT5, len)
@ -436,7 +438,7 @@ impl Jsonb {
                | ElementType::TEXTJ
                | ElementType::TEXT5 => {
                    current_cursor =
-                        self.serialize_string(string, current_cursor, len, &element_type)?;
+                        self.serialize_string(string, current_cursor, len, &element_type, true)?;
                }
                _ => bail_parse_error!("malformed JSON"),
            }
@ -474,9 +476,13 @@ impl Jsonb {
        cursor: usize,
        len: usize,
        kind: &ElementType,
+        quote: bool,
    ) -> Result<usize> {
        let word_slice = &self.data[cursor..cursor + len];
-        string.push('"');
+        if quote {
+            string.push('"');
+        }
+
        match kind {
            // Can be serialized as is. Do not need escaping
            ElementType::TEXT => {
@ -630,7 +636,10 @@ impl Jsonb {
                unreachable!()
            }
        }
-        string.push('"');
+        if quote {
+            string.push('"');
+        }
+
        Ok(cursor + len)
    }

@ -1367,9 +1376,122 @@ impl Jsonb {
        Ok(result)
    }

+    pub fn from_raw_data(data: &[u8]) -> Self {
+        Self::new(data.len(), Some(data))
+    }
+
    pub fn data(self) -> Vec<u8> {
        self.data
    }
+
+    pub fn get_by_path(&self, path: &JsonPath) -> Result<(Jsonb, ElementType)> {
+        let mut pos = 0;
+        let mut string_buffer = String::with_capacity(1024);
+        for segment in path.elements.iter() {
+            pos = self.navigate_to_segment(segment, pos, &mut string_buffer)?;
+        }
+        let (header, skip_header) = self.read_header(pos)?;
+        let end = pos + skip_header + header.1;
+        Ok((Jsonb::from_raw_data(&self.data[pos..end]), header.0))
+    }
+
+    fn navigate_to_segment(
+        &self,
+        segment: &PathElement,
+        pos: usize,
+        string_buffer: &mut String,
+    ) -> Result<usize> {
+        let (header, skip_header) = self.read_header(pos)?;
+        let (parent_type, parent_size) = (header.0, header.1);
+        let mut current_pos = pos + skip_header;
+
+        match segment {
+            PathElement::Root() => return Ok(0),
+            PathElement::Key(path_key, is_raw) => {
+                if parent_type != ElementType::OBJECT {
+                    bail_parse_error!("parent is not object")
+                };
+                while current_pos < pos + parent_size {
+                    let (key_header, skip_header) = self.read_header(current_pos)?;
+                    let (key_type, key_size) = (key_header.0, key_header.1);
+                    current_pos += skip_header;
+
+                    if matches!(
+                        key_header.0,
+                        ElementType::TEXT
+                            | ElementType::TEXT5
+                            | ElementType::TEXTJ
+                            | ElementType::TEXTRAW
+                    ) {
+                        string_buffer.clear();
+                        current_pos = self.serialize_string(
+                            string_buffer,
+                            current_pos,
+                            key_size,
+                            &key_type,
+                            false,
+                        )?;
+
+                        if compare((&string_buffer, key_type), (path_key, *is_raw)) {
+                            return Ok(current_pos);
+                        } else {
+                            current_pos = self.skip_element(current_pos)?;
+                        }
+                    } else {
+                        bail_parse_error!("Key is not text!")
+                    };
+                }
+            }
+            PathElement::ArrayLocator(idx) => {
+                if parent_type != ElementType::ARRAY {
+                    bail_parse_error!("parent is not array");
+                };
+                if let Some(id) = idx {
+                    let id = id.to_owned();
+                    if id > 0 {
+                        for _ in 0..id as usize {
+                            if current_pos < pos + parent_size {
+                                current_pos = self.skip_element(current_pos)?;
+                            } else {
+                                bail_parse_error!("Index is bigger then array size");
+                            }
+                        }
+                        return Ok(current_pos);
+                        // fix this after we remove serialized json
+                    } else {
+                        let mut temp_pos = current_pos;
+                        let mut count_elements = 0;
+                        while temp_pos < pos + parent_size {
+                            temp_pos = self.skip_element(temp_pos)?;
+                            count_elements += 1;
+                        }
+                        let corrected_idx = count_elements + id;
+                        if corrected_idx < 0 {
+                            bail_parse_error!("Index is bigger then array size")
+                        }
+                        for _ in 0..corrected_idx as usize {
+                            if current_pos < pos + parent_size {
+                                current_pos = self.skip_element(current_pos)?;
+                            } else {
+                                bail_parse_error!("Index is bigger then array size");
+                            }
+                        }
+                        return Ok(current_pos);
+                    }
+                } else {
+                    return Ok(pos);
+                }
+            }
+        }
+
+        bail_parse_error!("Did not find anything")
+    }
+
+    fn skip_element(&self, mut pos: usize) -> Result<usize> {
+        let (header, skip_header) = self.read_header(pos)?;
+        pos += skip_header + header.1;
+        Ok(pos)
+    }
 }

 impl std::str::FromStr for Jsonb {
@ -1380,6 +1502,125 @@ impl std::str::FromStr for Jsonb {
    }
 }

+#[inline]
+fn compare(key: (&str, ElementType), path_key: (&str, bool)) -> bool {
+    let (key, element_type) = key;
+    let (path_key, is_raw) = path_key;
+    if !is_raw && element_type == ElementType::TEXT {
+        if key.len() == path_key.len() {
+            return key == path_key;
+        } else {
+            return false;
+        }
+    }
+    if !is_raw {
+        return unescape_string(key) == path_key;
+    }
+    match element_type {
+        ElementType::TEXTJ | ElementType::TEXT5 | ElementType::TEXTRAW | ElementType::TEXT => {
+            return unescape_string(key) == unescape_string(path_key);
+        }
+        _ => {}
+    };
+
+    return false;
+}
+
+#[inline]
+pub fn unescape_string(input: &str) -> String {
+    let mut result = String::with_capacity(input.len());
+    let mut chars = input.chars().peekable();
+
+    while let Some(c) = chars.next() {
+        if c == '\\' {
+            match chars.next() {
+                Some('n') => result.push('\n'),
+                Some('r') => result.push('\r'),
+                Some('t') => result.push('\t'),
+                Some('\\') => result.push('\\'),
+                Some('/') => result.push('/'),
+                Some('"') => result.push('"'),
+                Some('b') => result.push('\u{0008}'),
+                Some('f') => result.push('\u{000C}'),
+
+                // Handle \uXXXX format (JSON style)
+                Some('u') => {
+                    let mut code_point = String::new();
+                    for _ in 0..4 {
+                        if let Some(hex_char) = chars.next() {
+                            code_point.push(hex_char);
+                        } else {
+                            break;
+                        }
+                    }
+
+                    if let Ok(code) = u16::from_str_radix(&code_point, 16) {
+                        // Check if this is a high surrogate
+                        if (0xD800..=0xDBFF).contains(&code) {
+                            if chars.next() == Some('\\') && chars.next() == Some('u') {
+                                let mut low_surrogate = String::new();
+                                for _ in 0..4 {
+                                    if let Some(hex_char) = chars.next() {
+                                        low_surrogate.push(hex_char);
+                                    } else {
+                                        break;
+                                    }
+                                }
+
+                                if let Ok(low_code) = u16::from_str_radix(&low_surrogate, 16) {
+                                    if (0xDC00..=0xDFFF).contains(&low_code) {
+                                        let high_ten_bits = (code - 0xD800) as u32;
+                                        let low_ten_bits = (low_code - 0xDC00) as u32;
+                                        let code_point = (high_ten_bits << 10) | low_ten_bits;
+                                        let unicode_value = code_point + 0x10000;
+
+                                        if let Some(unicode_char) = char::from_u32(unicode_value) {
+                                            result.push(unicode_char);
+                                        }
+                                    } else {
+                                        // If low surrogate is invalid, just push both as separate chars
+                                        if let Some(c1) = char::from_u32(code as u32) {
+                                            result.push(c1);
+                                        }
+                                        if let Some(c2) = char::from_u32(low_code as u32) {
+                                            result.push(c2);
+                                        }
+                                    }
+                                }
+                            } else {
+                                // No low surrogate, just push the high surrogate as is
+                                if let Some(unicode_char) = char::from_u32(code as u32) {
+                                    result.push(unicode_char);
+                                }
+                            }
+                        } else {
+                            // Not a surrogate pair, just a regular Unicode character
+                            if let Some(unicode_char) = char::from_u32(code as u32) {
+                                result.push(unicode_char);
+                            }
+                        }
+                    }
+                }
+
+                Some(c) => {
+                    // For any other escape sequence we don't recognize,
+                    // just output the backslash and the character
+                    result.push('\\');
+                    result.push(c);
+                }
+                None => {
+                    // Handle trailing backslash
+                    result.push('\\');
+                }
+            }
+        } else {
+            result.push(c);
+        }
+    }
+
+    result
+}
+
 #[inline]
 pub fn skip_whitespace(input: &[u8], mut pos: usize) -> usize {
    let len = input.len();