Merge 'Support backwards index scan and seeks + utilize indexes in removing ORDER BY' from Jussi Saurio

## Main stuff
- Support iterating an index backwards
- Support scanning an index (instead of seeking with a condition)
- Support backwards index seeks
- Support backwards rowid seeks
- Fix existing backwards iteration logic for table btrees
- Remove ORDER BY entirely if any index satisfies the ordering
- Add fuzz tests for rowid seeks and for one- and two-column index seeks

## Bytecode examples (note the absence of ORDER BY sorting):
one column index order by, forwards:
```sql
limbo> explain select first_name from users order by age;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     13    0                    0   Start at 13
1     OpenReadAsync      0     2     0                    0   table=users, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     274   0                    0   table=age_idx, root=274
4     OpenReadAwait      0     0     0                    0
5     RewindAsync        1     0     0                    0
6     RewindAwait        1     12    0                    0   Rewind table age_idx
7       DeferredSeek     1     0     0                    0
8       Column           0     1     1                    0   r[1]=users.first_name
9       ResultRow        1     1     0                    0   output=r[1]
10    NextAsync          1     0     0                    0
11    NextAwait          1     7     0                    0
12    Halt               0     0     0                    0
13    Transaction        0     0     0                    0   write=false
14    Goto               0     1     0                    0
```
one column index order by, backwards:
```sql
limbo> explain select first_name from users order by age desc;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     13    0                    0   Start at 13
1     OpenReadAsync      0     2     0                    0   table=users, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     274   0                    0   table=age_idx, root=274
4     OpenReadAwait      0     0     0                    0
5     LastAsync          1     0     0                    0
6     LastAwait          1     0     0                    0
7       DeferredSeek     1     0     0                    0
8       Column           0     1     1                    0   r[1]=users.first_name
9       ResultRow        1     1     0                    0   output=r[1]
10    PrevAsync          1     0     0                    0
11    PrevAwait          1     0     0                    0
12    Halt               0     0     0                    0
13    Transaction        0     0     0                    0   write=false
14    Goto               0     1     0                    0
```
rowid seek, backwards:
```sql
limbo> explain select * from users where id < 100 order by id desc;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     19    0                    0   Start at 19
1     OpenReadAsync      0     2     0                    0   table=users, root=2
2     OpenReadAwait      0     0     0                    0
3     Integer            100   11    0                    0   r[11]=100
4     SeekLT             0     18    11                   0
5       RowId            0     1     0                    0   r[1]=users.rowid
6       Column           0     1     2                    0   r[2]=users.first_name
7       Column           0     2     3                    0   r[3]=users.last_name
8       Column           0     3     4                    0   r[4]=users.email
9       Column           0     4     5                    0   r[5]=users.phone_number
10      Column           0     5     6                    0   r[6]=users.address
11      Column           0     6     7                    0   r[7]=users.city
12      Column           0     7     8                    0   r[8]=users.state
13      Column           0     8     9                    0   r[9]=users.zipcode
14      Column           0     9     10                   0   r[10]=users.age
15      ResultRow        1     10    0                    0   output=r[1..10]
16    PrevAsync          0     0     0                    0
17    PrevAwait          0     0     0                    0
18    Halt               0     0     0                    0
19    Transaction        0     0     0                    0   write=false
20    Goto               0     1     0                    0
```
two column order by, setup:
```sql
cargo run dualpk.db

Limbo v0.0.18-pre.3
Enter ".help" for usage hints.
limbo> .schema
CREATE TABLE a(b,c,d,e, primary key (d,c));
```
two column order by, forwards:
```sql
limbo> explain select * from a order by d,c;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     16    0                    0   Start at 16
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     RewindAsync        1     0     0                    0
6     RewindAwait        1     15    0                    0   Rewind table sqlite_autoindex_a_1
7       DeferredSeek     1     0     0                    0
8       Column           0     0     1                    0   r[1]=a.b
9       Column           0     1     2                    0   r[2]=a.c
10      Column           0     2     3                    0   r[3]=a.d
11      Column           0     3     4                    0   r[4]=a.e
12      ResultRow        1     4     0                    0   output=r[1..4]
13    NextAsync          1     0     0                    0
14    NextAwait          1     7     0                    0
15    Halt               0     0     0                    0
16    Transaction        0     0     0                    0   write=false
17    Goto               0     1     0                    0
```
two column order by, forwards with index seek:
```sql
limbo> explain select * from a where d > 100 order by d,c;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     16    0                    0   Start at 16
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     Integer            100   5     0                    0   r[5]=100
6     SeekGT             1     15    5                    0
7       DeferredSeek     1     0     0                    0
8       Column           0     0     1                    0   r[1]=a.b
9       Column           0     1     2                    0   r[2]=a.c
10      Column           0     2     3                    0   r[3]=a.d
11      Column           0     3     4                    0   r[4]=a.e
12      ResultRow        1     4     0                    0   output=r[1..4]
13    NextAsync          1     0     0                    0
14    NextAwait          1     7     0                    0
15    Halt               0     0     0                    0
16    Transaction        0     0     0                    0   write=false
17    Goto               0     1     0                    0
```
two column order by, forwards with index scan and termination condition:
```sql
limbo> explain select * from a where d < 100 order by d,c;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     18    0                    0   Start at 18
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     Null               0     5     0                    0   r[5]=NULL
6     SeekGT             1     17    5                    0
7       Integer          100   6     0                    0   r[6]=100
8       IdxGE            1     17    6                    0
9       DeferredSeek     1     0     0                    0
10      Column           0     0     1                    0   r[1]=a.b
11      Column           0     1     2                    0   r[2]=a.c
12      Column           0     2     3                    0   r[3]=a.d
13      Column           0     3     4                    0   r[4]=a.e
14      ResultRow        1     4     0                    0   output=r[1..4]
15    NextAsync          1     0     0                    0
16    NextAwait          1     7     0                    0
17    Halt               0     0     0                    0
18    Transaction        0     0     0                    0   write=false
19    Goto               0     1     0                    0
```
two column order by, backwards:
```sql
limbo> explain select * from a order by d desc,c desc;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     16    0                    0   Start at 16
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     LastAsync          1     0     0                    0
6     LastAwait          1     0     0                    0
7       DeferredSeek     1     0     0                    0
8       Column           0     0     1                    0   r[1]=a.b
9       Column           0     1     2                    0   r[2]=a.c
10      Column           0     2     3                    0   r[3]=a.d
11      Column           0     3     4                    0   r[4]=a.e
12      ResultRow        1     4     0                    0   output=r[1..4]
13    PrevAsync          1     0     0                    0
14    PrevAwait          1     0     0                    0
15    Halt               0     0     0                    0
16    Transaction        0     0     0                    0   write=false
17    Goto               0     1     0                    0
```
two column order by, backwards with index seek:
```sql
limbo> explain select * from a where d < 100 order by d desc,c desc;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     16    0                    0   Start at 16
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     Integer            100   5     0                    0   r[5]=100
6     SeekLT             1     15    5                    0
7       DeferredSeek     1     0     0                    0
8       Column           0     0     1                    0   r[1]=a.b
9       Column           0     1     2                    0   r[2]=a.c
10      Column           0     2     3                    0   r[3]=a.d
11      Column           0     3     4                    0   r[4]=a.e
12      ResultRow        1     4     0                    0   output=r[1..4]
13    PrevAsync          1     0     0                    0
14    PrevAwait          1     0     0                    0
15    Halt               0     0     0                    0
16    Transaction        0     0     0                    0   write=false
17    Goto               0     1     0                    0
```
two column order by, backwards with index scan and termination condition:
```sql
limbo> explain select * from a where d > 100 order by d desc,c desc;
addr  opcode             p1    p2    p3    p4             p5  comment
----  -----------------  ----  ----  ----  -------------  --  -------
0     Init               0     18    0                    0   Start at 18
1     OpenReadAsync      0     2     0                    0   table=a, root=2
2     OpenReadAwait      0     0     0                    0
3     OpenReadAsync      1     3     0                    0   table=sqlite_autoindex_a_1, root=3
4     OpenReadAwait      0     0     0                    0
5     LastAsync          1     0     0                    0
6     LastAwait          1     0     0                    0
7       Integer          100   6     0                    0   r[6]=100
8       IdxLE            1     17    6                    0
9       DeferredSeek     1     0     0                    0
10      Column           0     0     1                    0   r[1]=a.b
11      Column           0     1     2                    0   r[2]=a.c
12      Column           0     2     3                    0   r[3]=a.d
13      Column           0     3     4                    0   r[4]=a.e
14      ResultRow        1     4     0                    0   output=r[1..4]
15    PrevAsync          1     0     0                    0
16    PrevAwait          1     0     0                    0
17    Halt               0     0     0                    0
18    Transaction        0     0     0                    0   write=false
19    Goto               0     1     0                    0
```

Reviewed-by: Preston Thorpe (@PThorpe92)

Closes #1209
This commit is contained in:
Jussi Saurio 2025-04-09 12:03:14 +03:00
commit aa6e2d853a
15 changed files with 1367 additions and 403 deletions

View file

@ -4,6 +4,7 @@ use crate::storage::pager::Pager;
use crate::storage::sqlite3_ondisk::{
read_u32, read_varint, BTreeCell, PageContent, PageType, TableInteriorCell, TableLeafCell,
};
use crate::translate::plan::IterationDirection;
use crate::MvCursor;
use crate::types::{
@ -312,6 +313,17 @@ enum OverflowState {
Done,
}
/// Iteration state of the cursor. Can only be set once.
///
/// The state ties the kind of seek performed to the direction the cursor may move afterwards:
/// - Once a SeekGT or SeekGE is performed, the cursor must iterate forwards and calling prev() is an error.
/// - Similarly, once a SeekLT or SeekLE is performed, the cursor must iterate backwards and calling next() is an error.
/// - When a SeekEQ or SeekRowid is performed, the cursor is NOT allowed to iterate further.
///
/// `do_seek()` asserts that the state has been set and that the seek operator is compatible with it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IterationState {
/// No direction has been chosen yet; this is the state of a freshly constructed cursor.
/// Seeking while the state is still `Unset` trips an assertion in `do_seek()`.
Unset,
/// The cursor iterates in the given direction.
/// `do_seek()` accepts only GT/GE seeks when the direction is forwards and only LT/LE seeks when it is backwards.
Iterating(IterationDirection),
/// Point lookup: `do_seek()` accepts only an EQ seek in this state (cells are scanned forwards to find the match),
/// and the cursor must not be advanced in either direction afterwards.
IterationNotAllowed,
}
pub struct BTreeCursor {
/// The multi-version cursor that is used to read and write to the database file.
mv_cursor: Option<Rc<RefCell<MvCursor>>>,
@ -337,6 +349,8 @@ pub struct BTreeCursor {
/// Reusable immutable record, used to allow better allocation strategy.
reusable_immutable_record: RefCell<Option<ImmutableRecord>>,
empty_record: Cell<bool>,
pub iteration_state: IterationState,
}
/// Stack of pages representing the tree traversal order.
@ -385,6 +399,7 @@ impl BTreeCursor {
},
reusable_immutable_record: RefCell::new(None),
empty_record: Cell::new(true),
iteration_state: IterationState::Unset,
}
}
@ -404,7 +419,10 @@ impl BTreeCursor {
/// Move the cursor to the previous record and return it.
/// Used in backwards iteration.
fn get_prev_record(&mut self) -> Result<CursorResult<Option<u64>>> {
fn get_prev_record(
&mut self,
predicate: Option<(SeekKey<'_>, SeekOp)>,
) -> Result<CursorResult<Option<u64>>> {
loop {
let page = self.stack.top();
let cell_idx = self.stack.current_cell_index();
@ -413,11 +431,11 @@ impl BTreeCursor {
// todo: find a better way to flag moved to end or begin of page
if self.stack.current_cell_index_less_than_min() {
loop {
if self.stack.current_cell_index() > 0 {
self.stack.retreat();
if self.stack.current_cell_index() >= 0 {
break;
}
if self.stack.has_parent() {
self.going_upwards = true;
self.stack.pop();
} else {
// moved to begin of btree
@ -429,11 +447,6 @@ impl BTreeCursor {
}
let cell_idx = cell_idx as usize;
tracing::trace!(
"get_prev_record current id={} cell={}",
page.get().id,
cell_idx
);
return_if_locked!(page);
if !page.is_loaded() {
self.pager.load_page(page.clone())?;
@ -442,13 +455,24 @@ impl BTreeCursor {
let contents = page.get().contents.as_ref().unwrap();
let cell_count = contents.cell_count();
// If we are at the end of the page and we haven't just come back from the right child,
// we now need to move to the rightmost child.
if cell_idx as i32 == i32::MAX && !self.going_upwards {
let rightmost_pointer = contents.rightmost_pointer();
if let Some(rightmost_pointer) = rightmost_pointer {
self.stack
.push_backwards(self.pager.read_page(rightmost_pointer as usize)?);
continue;
}
}
let cell_idx = if cell_idx >= cell_count {
self.stack.set_cell_index(cell_count as i32 - 1);
cell_count - 1
} else {
cell_idx
};
let cell = contents.cell_get(
cell_idx,
payload_overflow_threshold_max(contents.page_type(), self.usable_space() as u16),
@ -462,9 +486,7 @@ impl BTreeCursor {
_rowid,
}) => {
let mem_page = self.pager.read_page(_left_child_page as usize)?;
self.stack.push(mem_page);
// use cell_index = i32::MAX to tell next loop to go to the end of the current page
self.stack.set_cell_index(i32::MAX);
self.stack.push_backwards(mem_page);
continue;
}
BTreeCell::TableLeafCell(TableLeafCell {
@ -484,8 +506,135 @@ impl BTreeCursor {
self.stack.retreat();
return Ok(CursorResult::Ok(Some(_rowid)));
}
BTreeCell::IndexInteriorCell(_) => todo!(),
BTreeCell::IndexLeafCell(_) => todo!(),
BTreeCell::IndexInteriorCell(IndexInteriorCell {
payload,
left_child_page,
first_overflow_page,
payload_size,
}) => {
if !self.going_upwards {
// In backwards iteration, if we haven't just moved to this interior node from the
// right child, but instead are about to move to the left child, we need to retreat
// so that we don't come back to this node again.
// For example:
// this parent: key 666
// left child has: key 663, key 664, key 665
// we need to move to the previous parent (with e.g. key 662) when iterating backwards.
self.stack.retreat();
let mem_page = self.pager.read_page(left_child_page as usize)?;
self.stack.push(mem_page);
// use cell_index = i32::MAX to tell next loop to go to the end of the current page
self.stack.set_cell_index(i32::MAX);
continue;
}
if let Some(next_page) = first_overflow_page {
return_if_io!(self.process_overflow_read(payload, next_page, payload_size))
} else {
crate::storage::sqlite3_ondisk::read_record(
payload,
self.get_immutable_record_or_create().as_mut().unwrap(),
)?
};
// Going upwards = we just moved to an interior cell from the right child.
// On the first pass we must take the record from the interior cell (since unlike table btrees, index interior cells have payloads)
// We then mark going_upwards=false so that we go back down the tree on the next invocation.
self.going_upwards = false;
if predicate.is_none() {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
{
Some(RefValue::Integer(rowid)) => *rowid as u64,
_ => unreachable!("index cells should have an integer rowid"),
};
return Ok(CursorResult::Ok(Some(rowid)));
}
let (key, op) = predicate.as_ref().unwrap();
let SeekKey::IndexKey(index_key) = key else {
unreachable!("index seek key should be a record");
};
let order = {
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let record_values = record.get_values();
let record_slice_same_num_cols =
&record_values[..index_key.get_values().len()];
let order =
compare_immutable(record_slice_same_num_cols, index_key.get_values());
order
};
let found = match op {
SeekOp::EQ => order.is_eq(),
SeekOp::LE => order.is_le(),
SeekOp::LT => order.is_lt(),
_ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"),
};
if found {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
{
Some(RefValue::Integer(rowid)) => *rowid as u64,
_ => unreachable!("index cells should have an integer rowid"),
};
return Ok(CursorResult::Ok(Some(rowid)));
} else {
continue;
}
}
BTreeCell::IndexLeafCell(IndexLeafCell {
payload,
first_overflow_page,
payload_size,
}) => {
if let Some(next_page) = first_overflow_page {
return_if_io!(self.process_overflow_read(payload, next_page, payload_size))
} else {
crate::storage::sqlite3_ondisk::read_record(
payload,
self.get_immutable_record_or_create().as_mut().unwrap(),
)?
};
self.stack.retreat();
if predicate.is_none() {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
{
Some(RefValue::Integer(rowid)) => *rowid as u64,
_ => unreachable!("index cells should have an integer rowid"),
};
return Ok(CursorResult::Ok(Some(rowid)));
}
let (key, op) = predicate.as_ref().unwrap();
let SeekKey::IndexKey(index_key) = key else {
unreachable!("index seek key should be a record");
};
let order = {
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let record_values = record.get_values();
let record_slice_same_num_cols =
&record_values[..index_key.get_values().len()];
let order =
compare_immutable(record_slice_same_num_cols, index_key.get_values());
order
};
let found = match op {
SeekOp::EQ => order.is_eq(),
SeekOp::LE => order.is_le(),
SeekOp::LT => order.is_lt(),
_ => unreachable!("Seek GT/GE should not happen in get_prev_record() because we are iterating backwards"),
};
if found {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
{
Some(RefValue::Integer(rowid)) => *rowid as u64,
_ => unreachable!("index cells should have an integer rowid"),
};
return Ok(CursorResult::Ok(Some(rowid)));
} else {
continue;
}
}
}
}
}
@ -720,6 +869,7 @@ impl BTreeCursor {
SeekOp::GT => order.is_gt(),
SeekOp::GE => order.is_ge(),
SeekOp::EQ => order.is_eq(),
_ => unreachable!("Seek LE/LT should not happen in get_next_record() because we are iterating forwards"),
};
if found {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
@ -771,6 +921,7 @@ impl BTreeCursor {
SeekOp::GT => order.is_lt(),
SeekOp::GE => order.is_le(),
SeekOp::EQ => order.is_le(),
_ => todo!("not implemented: {:?}", op),
};
if found {
let rowid = match self.get_immutable_record().as_ref().unwrap().last_value()
@ -792,6 +943,35 @@ impl BTreeCursor {
/// or e.g. find the first record greater than the seek key in a range query (e.g. SELECT * FROM table WHERE col > 10).
/// We don't include the rowid in the comparison and that's why the last value from the record is not included.
fn do_seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<Option<u64>>> {
assert!(
self.iteration_state != IterationState::Unset,
"iteration state must have been set before do_seek() is called"
);
let valid_op = match (self.iteration_state, op) {
(IterationState::Iterating(IterationDirection::Forwards), SeekOp::GE | SeekOp::GT) => {
true
}
(IterationState::Iterating(IterationDirection::Backwards), SeekOp::LE | SeekOp::LT) => {
true
}
(IterationState::IterationNotAllowed, SeekOp::EQ) => true,
_ => false,
};
assert!(
valid_op,
"invalid seek op for iteration state: {:?} {:?}",
self.iteration_state, op
);
let cell_iter_dir = match self.iteration_state {
IterationState::Iterating(IterationDirection::Forwards)
| IterationState::IterationNotAllowed => IterationDirection::Forwards,
IterationState::Iterating(IterationDirection::Backwards) => {
IterationDirection::Backwards
}
IterationState::Unset => {
unreachable!("iteration state must have been set before do_seek() is called");
}
};
return_if_io!(self.move_to(key.clone(), op.clone()));
{
@ -800,9 +980,27 @@ impl BTreeCursor {
let contents = page.get().contents.as_ref().unwrap();
for cell_idx in 0..contents.cell_count() {
let cell_count = contents.cell_count();
let mut cell_idx: isize = if cell_iter_dir == IterationDirection::Forwards {
0
} else {
cell_count as isize - 1
};
let end = if cell_iter_dir == IterationDirection::Forwards {
cell_count as isize - 1
} else {
0
};
self.stack.set_cell_index(cell_idx as i32);
while cell_count > 0
&& (if cell_iter_dir == IterationDirection::Forwards {
cell_idx <= end
} else {
cell_idx >= end
})
{
let cell = contents.cell_get(
cell_idx,
cell_idx as usize,
payload_overflow_threshold_max(
contents.page_type(),
self.usable_space() as u16,
@ -827,6 +1025,8 @@ impl BTreeCursor {
SeekOp::GT => *cell_rowid > rowid_key,
SeekOp::GE => *cell_rowid >= rowid_key,
SeekOp::EQ => *cell_rowid == rowid_key,
SeekOp::LE => *cell_rowid <= rowid_key,
SeekOp::LT => *cell_rowid < rowid_key,
};
if found {
if let Some(next_page) = first_overflow_page {
@ -841,10 +1041,10 @@ impl BTreeCursor {
self.get_immutable_record_or_create().as_mut().unwrap(),
)?
};
self.stack.advance();
self.stack.next_cell_in_direction(cell_iter_dir);
return Ok(CursorResult::Ok(Some(*cell_rowid)));
} else {
self.stack.advance();
self.stack.next_cell_in_direction(cell_iter_dir);
}
}
BTreeCell::IndexLeafCell(IndexLeafCell {
@ -869,14 +1069,17 @@ impl BTreeCursor {
};
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let without_rowid = &record.get_values().as_slice()[..record.len() - 1];
let order = without_rowid.cmp(index_key.get_values());
let record_slice_equal_number_of_cols =
&record.get_values().as_slice()[..index_key.get_values().len()];
let order = record_slice_equal_number_of_cols.cmp(index_key.get_values());
let found = match op {
SeekOp::GT => order.is_gt(),
SeekOp::GE => order.is_ge(),
SeekOp::EQ => order.is_eq(),
SeekOp::LE => order.is_le(),
SeekOp::LT => order.is_lt(),
};
self.stack.advance();
self.stack.next_cell_in_direction(cell_iter_dir);
if found {
let rowid = match record.last_value() {
Some(RefValue::Integer(rowid)) => *rowid as u64,
@ -889,6 +1092,11 @@ impl BTreeCursor {
unreachable!("unexpected cell type: {:?}", cell_type);
}
}
if cell_iter_dir == IterationDirection::Forwards {
cell_idx += 1;
} else {
cell_idx -= 1;
}
}
}
@ -909,7 +1117,20 @@ impl BTreeCursor {
// if we were to return Ok(CursorResult::Ok((None, None))), self.record would be None, which is incorrect, because we already know
// that there is a record with a key greater than K (K' = K+2) in the parent interior cell. Hence, we need to move back up the tree
// and get the next matching record from there.
return self.get_next_record(Some((key, op)));
match self.iteration_state {
IterationState::Iterating(IterationDirection::Forwards) => {
return self.get_next_record(Some((key, op)));
}
IterationState::Iterating(IterationDirection::Backwards) => {
return self.get_prev_record(Some((key, op)));
}
IterationState::Unset => {
unreachable!("iteration state must not be unset");
}
IterationState::IterationNotAllowed => {
unreachable!("iteration state must not be IterationNotAllowed");
}
}
}
Ok(CursorResult::Ok(None))
@ -983,6 +1204,13 @@ impl BTreeCursor {
// 6. If we find the cell, we return the record. Otherwise, we return an empty result.
self.move_to_root();
let iter_dir = match self.iteration_state {
IterationState::Iterating(IterationDirection::Backwards) => {
IterationDirection::Backwards
}
_ => IterationDirection::Forwards,
};
loop {
let page = self.stack.top();
return_if_locked!(page);
@ -994,7 +1222,7 @@ impl BTreeCursor {
let mut found_cell = false;
for cell_idx in 0..contents.cell_count() {
match &contents.cell_get(
let cell = contents.cell_get(
cell_idx,
payload_overflow_threshold_max(
contents.page_type(),
@ -1005,25 +1233,78 @@ impl BTreeCursor {
self.usable_space() as u16,
),
self.usable_space(),
)? {
)?;
match &cell {
BTreeCell::TableInteriorCell(TableInteriorCell {
_left_child_page,
_rowid,
_rowid: cell_rowid,
}) => {
let SeekKey::TableRowId(rowid_key) = key else {
unreachable!("table seek key should be a rowid");
};
let target_leaf_page_is_in_left_subtree = match cmp {
SeekOp::GT => rowid_key < *_rowid,
SeekOp::GE => rowid_key <= *_rowid,
SeekOp::EQ => rowid_key <= *_rowid,
// in sqlite btrees left child pages have <= keys.
// table btrees can have a duplicate rowid in the interior cell, so for example if we are looking for rowid=10,
// and we find an interior cell with rowid=10, we need to move to the left page since (due to the <= rule of sqlite btrees)
// the left page may have a rowid=10.
// Logic table for determining if target leaf page is in left subtree
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key is in left subtree
// GT | = or < | go right | First > key is in right subtree
// GE | > or = | go left | First >= key is in left subtree
// GE | < | go right | First >= key is in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > or = | go left | Last <= key is in left subtree
// LE | < | go right | Last <= key is in right subtree
// LT | > or = | go left | Last < key is in left subtree
// LT | < | go right?| Last < key is in right subtree, except if cell rowid is exactly 1 less
//
// No iteration (point query):
// EQ | > or = | go left | Last = key is in left subtree
// EQ | < | go right | Last = key is in right subtree
let target_leaf_page_is_in_left_subtree = match (self.iteration_state, cmp)
{
(
IterationState::Iterating(IterationDirection::Forwards),
SeekOp::GT,
) => *cell_rowid > rowid_key,
(
IterationState::Iterating(IterationDirection::Forwards),
SeekOp::GE,
) => *cell_rowid >= rowid_key,
(
IterationState::Iterating(IterationDirection::Backwards),
SeekOp::LE,
) => *cell_rowid >= rowid_key,
(
IterationState::Iterating(IterationDirection::Backwards),
SeekOp::LT,
) => *cell_rowid >= rowid_key || *cell_rowid == rowid_key - 1,
(_any, SeekOp::EQ) => *cell_rowid >= rowid_key,
_ => unreachable!(
"invalid combination of seek op and iteration state: {:?} {:?}",
cmp, self.iteration_state
),
};
self.stack.advance();
if target_leaf_page_is_in_left_subtree {
// If we found our target rowid in the left subtree,
// we need to move the parent cell pointer forwards or backwards depending on the iteration direction.
// For example: since the internal node contains the max rowid of the left subtree, we need to move the
// parent pointer backwards in backwards iteration so that we don't come back to the parent again.
// E.g.
// this parent: rowid 666
// left child has: 664,665,666
// we need to move to the previous parent (with e.g. rowid 663) when iterating backwards.
self.stack.next_cell_in_direction(iter_dir);
let mem_page = self.pager.read_page(*_left_child_page as usize)?;
self.stack.push(mem_page);
found_cell = true;
break;
} else {
self.stack.advance();
}
}
BTreeCell::TableLeafCell(TableLeafCell {
@ -1057,17 +1338,84 @@ impl BTreeCursor {
self.get_immutable_record_or_create().as_mut().unwrap(),
)?
};
let order = compare_immutable(
let record = self.get_immutable_record();
let record = record.as_ref().unwrap();
let record_slice_equal_number_of_cols =
&record.get_values().as_slice()[..index_key.get_values().len()];
let interior_cell_vs_index_key = compare_immutable(
record_slice_equal_number_of_cols,
index_key.get_values(),
self.get_immutable_record().as_ref().unwrap().get_values(),
);
let target_leaf_page_is_in_the_left_subtree = match cmp {
SeekOp::GT => order.is_lt(),
SeekOp::GE => order.is_le(),
SeekOp::EQ => order.is_le(),
// in sqlite btrees left child pages have <= keys.
// in general, in forwards iteration we want to find the first key that matches the seek condition.
// in backwards iteration we want to find the last key that matches the seek condition.
//
// Logic table for determining if target leaf page is in left subtree.
// For index b-trees this is a bit more complicated since the interior cells contain payloads (the key is the payload),
// and for non-unique indexes there might be several cells with the same key.
//
// Forwards iteration (looking for first match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// GT | > | go left | First > key could be exactly this one, or in left subtree
// GT | = or < | go right | First > key must be in right subtree
// GE | > | go left | First >= key could be exactly this one, or in left subtree
// GE | = | go left | First >= key could be exactly this one, or in left subtree
// GE | < | go right | First >= key must be in right subtree
//
// Backwards iteration (looking for last match in tree):
// OP | Current Cell vs Seek Key | Action? | Explanation
// LE | > | go left | Last <= key must be in left subtree
// LE | = | go right | Last <= key is either this one, or somewhere to the right of this one. So we need to go right to make sure
// LE | < | go right | Last <= key must be in right subtree
// LT | > | go left | Last < key must be in left subtree
// LT | = | go left | Last < key must be in left subtree since we want strictly less than
// LT | < | go right | Last < key could be exactly this one, or in right subtree
//
// No iteration (point query):
// EQ | > | go left | First = key must be in left subtree
// EQ | = | go left | First = key could be exactly this one, or in left subtree
// EQ | < | go right | First = key must be in right subtree
assert!(
self.iteration_state != IterationState::Unset,
"iteration state must have been set before move_to() is called"
);
let target_leaf_page_is_in_left_subtree = match (cmp, self.iteration_state)
{
(
SeekOp::GT,
IterationState::Iterating(IterationDirection::Forwards),
) => interior_cell_vs_index_key.is_gt(),
(
SeekOp::GE,
IterationState::Iterating(IterationDirection::Forwards),
) => interior_cell_vs_index_key.is_ge(),
(SeekOp::EQ, IterationState::IterationNotAllowed) => {
interior_cell_vs_index_key.is_ge()
}
(
SeekOp::LE,
IterationState::Iterating(IterationDirection::Backwards),
) => interior_cell_vs_index_key.is_gt(),
(
SeekOp::LT,
IterationState::Iterating(IterationDirection::Backwards),
) => interior_cell_vs_index_key.is_ge(),
_ => unreachable!(
"invalid combination of seek op and iteration state: {:?} {:?}",
cmp, self.iteration_state
),
};
if target_leaf_page_is_in_the_left_subtree {
// we don't advance in case of index tree internal nodes because we will visit this node going up
if target_leaf_page_is_in_left_subtree {
// we don't advance in case of forward iteration and index tree internal nodes because we will visit this node going up.
// in backwards iteration, we must retreat because otherwise we would unnecessarily visit this node again.
// Example:
// this parent: key 666, and we found the target key in the left child.
// left child has: key 663, key 664, key 665
// we need to move to the previous parent (with e.g. key 662) when iterating backwards so that we don't end up back here again.
if iter_dir == IterationDirection::Backwards {
self.stack.retreat();
}
let mem_page = self.pager.read_page(*left_child_page as usize)?;
self.stack.push(mem_page);
found_cell = true;
@ -2607,6 +2955,14 @@ impl BTreeCursor {
}
pub fn rewind(&mut self) -> Result<CursorResult<()>> {
assert!(
matches!(
self.iteration_state,
IterationState::Unset | IterationState::Iterating(IterationDirection::Forwards)
),
"iteration state must be unset or Iterating(Forwards) when rewind() is called"
);
self.iteration_state = IterationState::Iterating(IterationDirection::Forwards);
if self.mv_cursor.is_some() {
let rowid = return_if_io!(self.get_next_record(None));
self.rowid.replace(rowid);
@ -2622,6 +2978,14 @@ impl BTreeCursor {
}
pub fn last(&mut self) -> Result<CursorResult<()>> {
assert!(
matches!(
self.iteration_state,
IterationState::Unset | IterationState::Iterating(IterationDirection::Backwards)
),
"iteration state must be unset or Iterating(Backwards) when last() is called"
);
self.iteration_state = IterationState::Iterating(IterationDirection::Backwards);
assert!(self.mv_cursor.is_none());
match self.move_to_rightmost()? {
CursorResult::Ok(_) => self.prev(),
@ -2630,6 +2994,14 @@ impl BTreeCursor {
}
pub fn next(&mut self) -> Result<CursorResult<()>> {
assert!(
matches!(
self.iteration_state,
IterationState::Iterating(IterationDirection::Forwards)
),
"iteration state must be Iterating(Forwards) when next() is called, but it was {:?}",
self.iteration_state
);
let rowid = return_if_io!(self.get_next_record(None));
self.rowid.replace(rowid);
self.empty_record.replace(rowid.is_none());
@ -2637,8 +3009,15 @@ impl BTreeCursor {
}
pub fn prev(&mut self) -> Result<CursorResult<()>> {
assert!(
matches!(
self.iteration_state,
IterationState::Iterating(IterationDirection::Backwards)
),
"iteration state must be Iterating(Backwards) when prev() is called"
);
assert!(self.mv_cursor.is_none());
match self.get_prev_record()? {
match self.get_prev_record(None)? {
CursorResult::Ok(rowid) => {
self.rowid.replace(rowid);
self.empty_record.replace(rowid.is_none());
@ -2663,6 +3042,38 @@ impl BTreeCursor {
pub fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<bool>> {
assert!(self.mv_cursor.is_none());
match op {
SeekOp::GE | SeekOp::GT => {
if self.iteration_state == IterationState::Unset {
self.iteration_state = IterationState::Iterating(IterationDirection::Forwards);
} else {
assert!(matches!(
self.iteration_state,
IterationState::Iterating(IterationDirection::Forwards)
));
}
}
SeekOp::LE | SeekOp::LT => {
if self.iteration_state == IterationState::Unset {
self.iteration_state = IterationState::Iterating(IterationDirection::Backwards);
} else {
assert!(matches!(
self.iteration_state,
IterationState::Iterating(IterationDirection::Backwards)
));
}
}
SeekOp::EQ => {
if self.iteration_state == IterationState::Unset {
self.iteration_state = IterationState::IterationNotAllowed;
} else {
assert!(matches!(
self.iteration_state,
IterationState::IterationNotAllowed
));
}
}
};
let rowid = return_if_io!(self.do_seek(key, op));
self.rowid.replace(rowid);
self.empty_record.replace(rowid.is_none());
@ -3025,7 +3436,7 @@ impl BTreeCursor {
/// Search for a key in an Index Btree. Looking up indexes that need to be unique, we cannot compare the rowid
pub fn key_exists_in_index(&mut self, key: &ImmutableRecord) -> Result<CursorResult<bool>> {
return_if_io!(self.do_seek(SeekKey::IndexKey(key), SeekOp::GE));
return_if_io!(self.seek(SeekKey::IndexKey(key), SeekOp::GE));
let record_opt = self.record();
match record_opt.as_ref() {
@ -3056,7 +3467,7 @@ impl BTreeCursor {
OwnedValue::Integer(i) => i,
_ => unreachable!("btree tables are indexed by integers!"),
};
return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ));
let _ = return_if_io!(self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ));
let page = self.stack.top();
// TODO(pere): request load
return_if_locked!(page);
@ -3485,7 +3896,7 @@ impl PageStack {
}
/// Push a new page onto the stack.
/// This effectively means traversing to a child page.
fn push(&self, page: PageRef) {
fn _push(&self, page: PageRef, starting_cell_idx: i32) {
tracing::trace!(
"pagestack::push(current={}, new_page_id={})",
self.current_page.get(),
@ -3498,7 +3909,15 @@ impl PageStack {
"corrupted database, stack is bigger than expected"
);
self.stack.borrow_mut()[current as usize] = Some(page);
self.cell_indices.borrow_mut()[current as usize] = 0;
self.cell_indices.borrow_mut()[current as usize] = starting_cell_idx;
}
/// Push `page` onto the stack with its cell index starting at 0.
/// Thin wrapper around `_push` that supplies `0` as the starting cell index.
fn push(&self, page: PageRef) {
self._push(page, 0);
}
/// Push `page` onto the stack with its cell index set to `i32::MAX`.
/// Thin wrapper around `_push` that supplies `i32::MAX` as the starting cell index.
/// NOTE(review): `i32::MAX` looks like a sentinel meaning "start at the last cell" for
/// backwards iteration — confirm against the code that consumes the cell index.
fn push_backwards(&self, page: PageRef) {
self._push(page, i32::MAX);
}
/// Pop a page off the stack.
@ -3558,6 +3977,18 @@ impl PageStack {
self.cell_indices.borrow_mut()[current] -= 1;
}
/// Step to the neighbouring cell of the current page in the given iteration direction.
/// Forwards delegates to `advance()`, backwards delegates to `retreat()`.
fn next_cell_in_direction(&self, iteration_direction: IterationDirection) {
    // Exhaustive match on purpose: a new direction variant must be handled explicitly.
    match iteration_direction {
        IterationDirection::Forwards => self.advance(),
        IterationDirection::Backwards => self.retreat(),
    }
}
fn set_cell_index(&self, idx: i32) {
let current = self.current();
self.cell_indices.borrow_mut()[current] = idx
@ -4824,7 +5255,7 @@ mod tests {
run_until_done(
|| {
let key = SeekKey::TableRowId(key as u64);
cursor.move_to(key, SeekOp::EQ)
cursor.seek(key, SeekOp::EQ)
},
pager.deref(),
)
@ -4841,6 +5272,8 @@ mod tests {
// FIXME: add sorted vector instead, should be okay for small amounts of keys for now :P, too lazy to fix right now
keys.sort();
cursor.move_to_root();
// hack to allow bypassing our internal invariant of not allowing cursor iteration after SeekOp::EQ
cursor.iteration_state = IterationState::Iterating(IterationDirection::Forwards);
let mut valid = true;
for key in keys.iter() {
tracing::trace!("seeking key: {}", key);
@ -4852,6 +5285,7 @@ mod tests {
break;
}
}
cursor.iteration_state = IterationState::Unset;
// let's validate btree too so that we understand where the btree failed
if matches!(validate_btree(pager.clone(), root_page), (_, false)) || !valid {
let btree_after = format_btree(pager.clone(), root_page, 0);
@ -4869,6 +5303,8 @@ mod tests {
}
keys.sort();
cursor.move_to_root();
// hack to allow bypassing our internal invariant of not allowing cursor iteration after SeekOp::EQ
cursor.iteration_state = IterationState::Iterating(IterationDirection::Forwards);
for key in keys.iter() {
tracing::trace!("seeking key: {}", key);
run_until_done(|| cursor.next(), pager.deref()).unwrap();
@ -5740,7 +6176,7 @@ mod tests {
run_until_done(
|| {
let key = SeekKey::TableRowId(i as u64);
cursor.move_to(key, SeekOp::EQ)
cursor.seek(key, SeekOp::EQ)
},
pager.deref(),
)
@ -5820,7 +6256,7 @@ mod tests {
run_until_done(
|| {
let key = SeekKey::TableRowId(i as u64);
cursor.move_to(key, SeekOp::EQ)
cursor.seek(key, SeekOp::EQ)
},
pager.deref(),
)
@ -5902,7 +6338,7 @@ mod tests {
run_until_done(
|| {
let key = SeekKey::TableRowId(i as u64);
cursor.move_to(key, SeekOp::EQ)
cursor.seek(key, SeekOp::EQ)
},
pager.deref(),
)