meteorgan 2025-07-07 17:57:01 +07:00 committed by GitHub
commit 4faacc2066
2 changed files with 76 additions and 103 deletions

@@ -68,7 +68,14 @@ pub mod offset {
/// The number of cells in the page (u16).
pub const BTREE_CELL_COUNT: usize = 3;
-/// A pointer to first byte of cell allocated content from top (u16).
+/// A pointer to the first byte of cell allocated content from top (u16).
+///
+/// A zero value for this integer is interpreted as 65,536.
+/// If a page contains no cells (which is only possible for a root page of a table that
+/// contains no rows) then the offset to the cell content area will equal the page size minus
+/// the bytes of reserved space. If the database uses a 65536-byte page size and the
+/// reserved space is zero (the usual value for reserved space) then the cell content offset of
+/// an empty page wants to be 65,536.
///
/// SQLite strives to place cells as far toward the end of the b-tree page as it can, in
/// order to leave space for future growth of the cell pointer array. This means that the
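Aside: a minimal sketch of the empty-page rule described in this hunk's new doc comment, as a standalone helper (hypothetical, not part of this codebase). With no cells, the content offset equals the page size minus the reserved space, and 65,536 is stored as 0 because it does not fit in the two-byte field:

fn empty_page_content_offset(page_size: u32, reserved_space: u32) -> u16 {
    let offset = page_size - reserved_space;
    // 65_536 == u16::MAX + 1, so it is encoded as 0 on disk
    if offset == 65_536 { 0 } else { offset as u16 }
}
// empty_page_content_offset(65_536, 0) == 0
// empty_page_content_offset(4_096, 0) == 4_096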
@@ -2214,10 +2221,10 @@ impl BTreeCursor {
cell_idx,
self.usable_space() as u16,
)?;
-contents.overflow_cells.len()
+!contents.overflow_cells.is_empty()
};
self.stack.set_cell_index(cell_idx as i32);
-if overflow > 0 {
+if overflow {
// A balance will happen so save the key we were inserting
tracing::debug!(page = page.get().get().id, cell_idx, "balance triggered:");
self.save_context(match bkey {
@@ -3870,15 +3877,12 @@ impl BTreeCursor {
while self.find_cell_state.get_cell_idx() < cell_count as isize {
assert!(self.find_cell_state.get_cell_idx() >= 0);
let cell_idx = self.find_cell_state.get_cell_idx() as usize;
-match page
-.cell_get(
-cell_idx,
-payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16),
-payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16),
-self.usable_space(),
-)
-.unwrap()
-{
+match page.cell_get(
+cell_idx,
+payload_overflow_threshold_max(page.page_type(), self.usable_space() as u16),
+payload_overflow_threshold_min(page.page_type(), self.usable_space() as u16),
+self.usable_space(),
+)? {
BTreeCell::TableLeafCell(cell) => {
if key.to_rowid() <= cell._rowid {
break;
@@ -4259,13 +4263,10 @@ impl BTreeCursor {
page.get().get_contents().page_type(),
PageType::TableLeaf | PageType::TableInterior
) {
-let _target_rowid = match return_if_io!(self.rowid()) {
-Some(rowid) => rowid,
-_ => {
-self.state = CursorState::None;
-return Ok(CursorResult::Ok(()));
-}
-};
+if return_if_io!(self.rowid()).is_none() {
+self.state = CursorState::None;
+return Ok(CursorResult::Ok(()));
+}
} else if self.reusable_immutable_record.borrow().is_none() {
self.state = CursorState::None;
return Ok(CursorResult::Ok(()));
@@ -4374,8 +4375,6 @@ impl BTreeCursor {
let page = page.get();
let contents = page.get_contents();
-let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1);
let delete_info = self.state.mut_delete_info().unwrap();
if !contents.is_leaf() {
delete_info.state = DeleteState::InteriorNodeReplacement {
@@ -4384,7 +4383,7 @@
post_balancing_seek_key,
};
} else {
-let contents = page.get().contents.as_mut().unwrap();
+let is_last_cell = cell_idx == contents.cell_count().saturating_sub(1);
drop_cell(contents, cell_idx, self.usable_space() as u16)?;
let delete_info = self.state.mut_delete_info().unwrap();
@@ -6068,8 +6067,8 @@ fn free_cell_range(
pc
};
-if offset <= page.cell_content_area() {
-if offset < page.cell_content_area() {
+if (offset as u32) <= page.cell_content_area() {
+if (offset as u32) < page.cell_content_area() {
return_corrupt!("Free block before content area");
}
if pointer_to_pc != page.offset as u16 + offset::BTREE_FIRST_FREEBLOCK as u16 {
@@ -6244,8 +6243,13 @@ fn insert_into_cell(
Ok(())
}
-/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected to be between first cell byte
-/// and end of cell pointer area.
+/// The amount of free space is the sum of:
+/// #1. The size of the unallocated region
+/// #2. Fragments (isolated 1-3 byte chunks of free space within the cell content area)
+/// #3. Freeblocks (linked list of blocks of at least 4 bytes within the cell content area that
+/// are not in use due to e.g. deletions)
+/// Free blocks can be zero, meaning the "real free space" that can be used to allocate is expected
+/// to be between first cell byte and end of cell pointer area.
#[allow(unused_assignments)]
fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
// TODO(pere): maybe free space is not calculated correctly with offset
@@ -6254,38 +6258,14 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
// space that is not reserved for extensions by sqlite. Usually reserved_space is 0.
let usable_space = usable_space as usize;
-let mut cell_content_area_start = page.cell_content_area();
-// A zero value for the cell content area pointer is interpreted as 65536.
-// See https://www.sqlite.org/fileformat.html
-// The max page size for a sqlite database is 64kiB i.e. 65536 bytes.
-// 65536 is u16::MAX + 1, and since cell content grows from right to left, this means
-// the cell content area pointer is at the end of the page,
-// i.e.
-// 1. the page size is 64kiB
-// 2. there are no cells on the page
-// 3. there is no reserved space at the end of the page
-if cell_content_area_start == 0 {
-cell_content_area_start = u16::MAX;
-}
-// The amount of free space is the sum of:
-// #1. the size of the unallocated region
-// #2. fragments (isolated 1-3 byte chunks of free space within the cell content area)
-// #3. freeblocks (linked list of blocks of at least 4 bytes within the cell content area that are not in use due to e.g. deletions)
-let pointer_size = if matches!(page.page_type(), PageType::TableLeaf | PageType::IndexLeaf) {
-0
-} else {
-4
-};
-let first_cell = page.offset + 8 + pointer_size + (2 * page.cell_count());
-let mut free_space_bytes =
-cell_content_area_start as usize + page.num_frag_free_bytes() as usize;
+let first_cell = page.offset + page.header_size() + (2 * page.cell_count());
+let cell_content_area_start = page.cell_content_area() as usize;
+let mut free_space_bytes = cell_content_area_start + page.num_frag_free_bytes() as usize;
// #3 is computed by iterating over the freeblocks linked list
let mut cur_freeblock_ptr = page.first_freeblock() as usize;
if cur_freeblock_ptr > 0 {
-if cur_freeblock_ptr < cell_content_area_start as usize {
+if cur_freeblock_ptr < cell_content_area_start {
// Freeblocks exist in the cell content area e.g. after deletions
// They should never exist in the unused area of the page.
todo!("corrupted page");
@@ -6299,7 +6279,7 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
size = page.read_u16_no_offset(cur_freeblock_ptr + 2) as usize; // next 2 bytes in freeblock = size of current freeblock
free_space_bytes += size;
// Freeblocks are in order from left to right on the page,
-// so next pointer should > current pointer + its size, or 0 if no next block exists.
+// so the next pointer should be > current pointer + its size, or 0 if no next block exists.
if next <= cur_freeblock_ptr + size + 3 {
break;
}
@@ -6307,8 +6287,8 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
}
// Next should always be 0 (NULL) at this point since we have reached the end of the freeblocks linked list
-assert!(
-next == 0,
+assert_eq!(
+next, 0,
"corrupted page: freeblocks list not in ascending order"
);
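Aside: the loop above walks the freeblock chain. Each freeblock begins with a 4-byte header: a 2-byte big-endian pointer to the next freeblock (0 terminates the list), then the 2-byte size of the block including its header. A minimal sketch over a raw page buffer (hypothetical helper, not this crate's API):

fn read_freeblock_header(page: &[u8], ptr: usize) -> (usize, usize) {
    let next = u16::from_be_bytes([page[ptr], page[ptr + 1]]) as usize; // 0 = end of list
    let size = u16::from_be_bytes([page[ptr + 2], page[ptr + 3]]) as usize;
    (next, size)
}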
@@ -6323,10 +6303,6 @@ fn compute_free_space(page: &PageContent, usable_space: u16) -> u16 {
"corrupted page: free space is greater than usable space"
);
-// if( nFree>usableSize || nFree<iCellFirst ){
-// return SQLITE_CORRUPT_PAGE(pPage);
-// }
free_space_bytes as u16 - first_cell as u16
}
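Aside: a worked example of the #1 + #2 + #3 sum computed by this function, using hypothetical values for an empty 4096-byte table-leaf page with no reserved space:

fn main() {
    let usable_space: usize = 4096;
    let header_size: usize = 8; // table-leaf page header
    let cell_count: usize = 0;
    let first_cell = header_size + 2 * cell_count; // end of the cell pointer array
    let cell_content_area_start = usable_space; // no cells allocated yet
    let frag_bytes = 0; // #2: no fragments
    let freeblock_bytes = 0; // #3: empty freeblock list
    let free = cell_content_area_start + frag_bytes + freeblock_bytes - first_cell;
    assert_eq!(free, 4088); // #1: everything between the pointer array and page end
}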
@@ -6417,7 +6393,6 @@ fn fill_cell_payload(
cell_payload.resize(prev_size + space_left + 4, 0);
let mut pointer = unsafe { cell_payload.as_mut_ptr().add(prev_size) };
let mut pointer_to_next = unsafe { cell_payload.as_mut_ptr().add(prev_size + space_left) };
-let mut overflow_pages = Vec::new();
loop {
let to_copy = space_left.min(to_copy_buffer.len());
@@ -6431,7 +6406,6 @@ fn fill_cell_payload(
// we still have bytes to add, we will need to allocate new overflow page
// FIXME: handle page cache is full
let overflow_page = pager.allocate_overflow_page();
-overflow_pages.push(overflow_page.clone());
{
let id = overflow_page.get().id as u32;
let contents = overflow_page.get().contents.as_mut().unwrap();
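Aside: overflow pages form a singly linked list; the first 4 bytes of each overflow page hold the big-endian page number of the next overflow page, with 0 marking the end of the chain. A sketch of following that link (hypothetical helper over an already-loaded buffer):

fn next_overflow_page(page: &[u8]) -> Option<u32> {
    let next = u32::from_be_bytes([page[0], page[1], page[2], page[3]]);
    (next != 0).then_some(next)
}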

@@ -45,11 +45,17 @@
use tracing::{instrument, Level};
+use super::pager::PageRef;
+use super::wal::LimboRwLock;
use crate::error::LimboError;
use crate::fast_lock::SpinLock;
use crate::io::{
Buffer, Complete, Completion, CompletionType, ReadCompletion, SyncCompletion, WriteCompletion,
};
+use crate::storage::btree::offset::{
+BTREE_CELL_CONTENT_AREA, BTREE_CELL_COUNT, BTREE_FIRST_FREEBLOCK, BTREE_FRAGMENTED_BYTES_COUNT,
+BTREE_PAGE_TYPE, BTREE_RIGHTMOST_PTR,
+};
use crate::storage::buffer_pool::BufferPool;
use crate::storage::database::DatabaseStorage;
use crate::storage::pager::Pager;
@@ -65,9 +71,6 @@ use std::rc::Rc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
-use super::pager::PageRef;
-use super::wal::LimboRwLock;
/// The size of the database header in bytes.
pub const DATABASE_HEADER_SIZE: usize = 100;
// DEFAULT_CACHE_SIZE negative values mean that we store the amount of pages a XKiB of memory can hold.
@@ -357,6 +360,8 @@ pub struct OverflowCell {
#[derive(Debug)]
pub struct PageContent {
+/// The position where the page content starts. It's 100 for page 1 (the database
+/// file header is 100 bytes), 0 for all other pages.
pub offset: usize,
pub buffer: Arc<RefCell<Buffer>>,
pub overflow_cells: Vec<OverflowCell>,
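Aside: every b-tree header read below is relative to this offset, which is how page 1 skips the 100-byte database file header. A sketch of the addressing, assuming raw byte access (hypothetical helper):

fn read_btree_header_u16(page: &[u8], page_offset: usize, field: usize) -> u16 {
    // page_offset is 100 on page 1 and 0 on every other page
    let pos = page_offset + field;
    u16::from_be_bytes([page[pos], page[pos + 1]])
}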
@@ -373,6 +378,7 @@ impl Clone for PageContent {
}
}
+const CELL_POINTER_SIZE_BYTES: usize = 2;
impl PageContent {
pub fn new(offset: usize, buffer: Arc<RefCell<Buffer>>) -> Self {
Self {
@@ -383,7 +389,7 @@ impl PageContent {
}
pub fn page_type(&self) -> PageType {
-self.read_u8(0).try_into().unwrap()
+self.read_u8(BTREE_PAGE_TYPE).try_into().unwrap()
}
pub fn maybe_page_type(&self) -> Option<PageType> {
@@ -452,19 +458,14 @@ impl PageContent {
buf[self.offset + pos..self.offset + pos + 4].copy_from_slice(&value.to_be_bytes());
}
-/// The second field of the b-tree page header is the offset of the first freeblock, or zero if there are no freeblocks on the page.
-/// A freeblock is a structure used to identify unallocated space within a b-tree page.
-/// Freeblocks are organized as a chain.
-///
-/// To be clear, freeblocks do not mean the regular unallocated free space to the left of the cell content area pointer, but instead
-/// blocks of at least 4 bytes WITHIN the cell content area that are not in use due to e.g. deletions.
+/// The offset of the first freeblock, or zero if there are no freeblocks on the page.
pub fn first_freeblock(&self) -> u16 {
-self.read_u16(1)
+self.read_u16(BTREE_FIRST_FREEBLOCK)
}
/// The number of cells on the page.
pub fn cell_count(&self) -> usize {
-self.read_u16(3) as usize
+self.read_u16(BTREE_CELL_COUNT) as usize
}
/// The size of the cell pointer array in bytes.
@@ -486,11 +487,13 @@ impl PageContent {
}
/// The start of the cell content area.
/// SQLite strives to place cells as far toward the end of the b-tree page as it can,
/// in order to leave space for future growth of the cell pointer array.
/// i.e. the cell content area pointer moves leftward as cells are added to the page
-pub fn cell_content_area(&self) -> u16 {
-self.read_u16(5)
+pub fn cell_content_area(&self) -> u32 {
+let offset = self.read_u16(BTREE_CELL_CONTENT_AREA);
+if offset == 0 {
+MAX_PAGE_SIZE
+} else {
+offset as u32
+}
}
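Aside: the return type widens to u32 because the on-disk field is a u16 in which 0 encodes 65,536, a value that cannot fit in u16. The same rule as a standalone decode, assuming MAX_PAGE_SIZE is 65,536 as in the SQLite format:

fn decode_cell_content_area(raw: u16) -> u32 {
    if raw == 0 { 65_536 } else { raw as u32 }
}
// decode_cell_content_area(0) == 65_536
// decode_cell_content_area(4096) == 4096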
/// The size of the page header in bytes.
@@ -504,16 +507,15 @@ impl PageContent {
}
}
-/// The total number of bytes in all fragments is stored in the fifth field of the b-tree page header.
-/// Fragments are isolated groups of 1, 2, or 3 unused bytes within the cell content area.
+/// The total number of bytes in all fragments.
pub fn num_frag_free_bytes(&self) -> u8 {
-self.read_u8(7)
+self.read_u8(BTREE_FRAGMENTED_BYTES_COUNT)
}
pub fn rightmost_pointer(&self) -> Option<u32> {
match self.page_type() {
-PageType::IndexInterior => Some(self.read_u32(8)),
-PageType::TableInterior => Some(self.read_u32(8)),
+PageType::IndexInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)),
+PageType::TableInterior => Some(self.read_u32(BTREE_RIGHTMOST_PTR)),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
}
@@ -521,9 +523,11 @@ impl PageContent {
pub fn rightmost_pointer_raw(&self) -> Option<*mut u8> {
match self.page_type() {
-PageType::IndexInterior | PageType::TableInterior => {
-Some(unsafe { self.as_ptr().as_mut_ptr().add(self.offset + 8) })
-}
+PageType::IndexInterior | PageType::TableInterior => Some(unsafe {
+self.as_ptr()
+.as_mut_ptr()
+.add(self.offset + BTREE_RIGHTMOST_PTR)
+}),
PageType::IndexLeaf => None,
PageType::TableLeaf => None,
}
@@ -540,16 +544,14 @@ impl PageContent {
let buf = self.as_ptr();
let ncells = self.cell_count();
-// the page header is 12 bytes for interior pages, 8 bytes for leaf pages
-// this is because the 4 last bytes in the interior page's header are used for the rightmost pointer.
-let cell_pointer_array_start = self.header_size();
assert!(
idx < ncells,
"cell_get: idx out of bounds: idx={}, ncells={}",
idx,
ncells
);
-let cell_pointer = cell_pointer_array_start + (idx * 2);
+let cell_pointer_array_start = self.header_size();
+let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
// SAFETY: this buffer is valid as long as the page is alive. We could store the page in the cell and do some lifetime magic
@@ -570,9 +572,8 @@ impl PageContent {
pub fn cell_table_interior_read_rowid(&self, idx: usize) -> Result<i64> {
debug_assert!(self.page_type() == PageType::TableInterior);
let buf = self.as_ptr();
-const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12;
-let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES;
-let cell_pointer = cell_pointer_array_start + (idx * 2);
+let cell_pointer_array_start = self.header_size();
+let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
const LEFT_CHILD_PAGE_SIZE_BYTES: usize = 4;
let (rowid, _) = read_varint(&buf[cell_pointer + LEFT_CHILD_PAGE_SIZE_BYTES..])?;
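Aside: read_varint decodes SQLite's variable-length integers: up to eight bytes contribute 7 bits each (a set high bit means "continue"), and a ninth byte, if reached, contributes all 8 bits. A self-contained sketch of that encoding (not this crate's implementation):

fn decode_varint(buf: &[u8]) -> (u64, usize) {
    let mut value: u64 = 0;
    for i in 0..8 {
        value = (value << 7) | (buf[i] & 0x7f) as u64;
        if buf[i] & 0x80 == 0 {
            return (value, i + 1); // high bit clear: last byte
        }
    }
    ((value << 8) | buf[8] as u64, 9) // ninth byte contributes all 8 bits
}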
@@ -584,9 +585,8 @@ impl PageContent {
pub fn cell_table_interior_read_left_child_page(&self, idx: usize) -> Result<u32> {
debug_assert!(self.page_type() == PageType::TableInterior);
let buf = self.as_ptr();
-const INTERIOR_PAGE_HEADER_SIZE_BYTES: usize = 12;
-let cell_pointer_array_start = INTERIOR_PAGE_HEADER_SIZE_BYTES;
-let cell_pointer = cell_pointer_array_start + (idx * 2);
+let cell_pointer_array_start = self.header_size();
+let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
Ok(u32::from_be_bytes([
buf[cell_pointer],
@@ -601,9 +601,8 @@ impl PageContent {
pub fn cell_table_leaf_read_rowid(&self, idx: usize) -> Result<i64> {
debug_assert!(self.page_type() == PageType::TableLeaf);
let buf = self.as_ptr();
-const LEAF_PAGE_HEADER_SIZE_BYTES: usize = 8;
-let cell_pointer_array_start = LEAF_PAGE_HEADER_SIZE_BYTES;
-let cell_pointer = cell_pointer_array_start + (idx * 2);
+let cell_pointer_array_start = self.header_size();
+let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16(cell_pointer) as usize;
let mut pos = cell_pointer;
let (_, nr) = read_varint(&buf[pos..])?;
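Aside: a table-leaf cell starts with two varints, payload size then rowid, which is why the code above decodes and discards one varint before reading the rowid. A sketch reusing the decode_varint helper from the previous aside:

fn leaf_cell_rowid(buf: &[u8], cell_pointer: usize) -> u64 {
    let (_payload_size, n) = decode_varint(&buf[cell_pointer..]);
    let (rowid, _) = decode_varint(&buf[cell_pointer + n..]);
    rowid
}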
@@ -623,7 +622,7 @@ impl PageContent {
(self.offset + header_size, self.cell_pointer_array_size())
}
-/// Get region of a cell's payload
+/// Get the region (start and length) of a cell's payload
pub fn cell_get_raw_region(
&self,
idx: usize,
@@ -635,7 +634,7 @@ impl PageContent {
let ncells = self.cell_count();
let (cell_pointer_array_start, _) = self.cell_pointer_array_offset_and_size();
assert!(idx < ncells, "cell_get: idx out of bounds");
-let cell_pointer = cell_pointer_array_start + (idx * 2); // pointers are 2 bytes each
+let cell_pointer = cell_pointer_array_start + (idx * CELL_POINTER_SIZE_BYTES);
let cell_pointer = self.read_u16_no_offset(cell_pointer) as usize;
let start = cell_pointer;
let len = match self.page_type() {