Files
@ 13bac4f35619
Branch filter:
Location: CSY/reowolf/src/collections/string_pool.rs - annotation
13bac4f35619
4.5 KiB
application/rls-services+xml
WIP on tokenization
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 13bac4f35619 | use std::ptr::null_mut;
const SLAB_SIZE: usize = u16::max_value() as usize;
pub struct StringRef {
data: *const u8,
length: usize,
}
impl StringRef {
pub fn as_str<'a>(&'a self) -> &'a str {
unsafe {
let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length);
std::str::from_utf8_unchecked(slice)
}
}
}
struct StringPoolSlab {
prev: *mut StringPoolSlab,
data: Vec<u8>,
remaining: usize,
}
impl StringPoolSlab {
fn new(prev: *mut StringPoolSlab) -> Self {
Self{ prev, data: Vec::with_capacity(SLAB_SIZE), remaining: SLAB_SIZE }
}
}
/// StringPool is a ever-growing pool of strings. Strings have a maximum size
/// equal to the slab size. The slabs are essentially a linked list to maintain
/// pointer-stability of the strings themselves.
/// All `StringRef` instances are invalidated when the string pool is dropped
pub(crate) struct StringPool {
last: *mut StringPoolSlab,
}
impl StringPool {
pub(crate) fn new() -> Self {
// To have some stability we just turn a box into a raw ptr.
let initial_slab = Box::new(StringPoolSlab::new(null_mut()));
let initial_slab = Box::into_raw(initial_slab);
StringPool{
last: initial_slab,
}
}
// Interns a string to the StringPool, returning a reference to it
pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef {
// TODO: Large string allocations, if ever needed.
let data_len = data.len();
assert!(data_len <= SLAB_SIZE, "string is too large for slab");
debug_assert!(std::str::from_utf8(data).is_ok(), "string to intern is not valid UTF-8 encoded");
let mut last = unsafe{&mut *self.last};
if data.len() > last.remaining {
// Doesn't fit: allocate new slab
self.alloc_new_slab();
last = unsafe{&mut *self.last};
}
// Must fit now
debug_assert!(data_len <= last.remaining);
let range_start = last.data.len();
last.data.extend_from_slice(data);
last.remaining -= data_len;
debug_assert_eq!(range_start + data_len, last.data.len());
unsafe {
let start = last.data.as_ptr().offset(range_start as isize);
StringRef{ data: start, length: data_len }
}
}
fn alloc_new_slab(&mut self) {
let new_slab = Box::new(StringPoolSlab::new(self.last));
let new_slab = Box::into_raw(new_slab);
self.last = new_slab;
}
}
impl Drop for StringPool {
fn drop(&mut self) {
let mut new_slab = self.last;
while !new_slab.is_null() {
let cur_slab = new_slab;
unsafe {
new_slab = (*cur_slab).prev;
Box::from_raw(cur_slab); // consume and deallocate
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_string_just_fits() {
let large = "0".repeat(SLAB_SIZE);
let mut pool = StringPool::new();
let interned = pool.intern(large.as_bytes());
assert_eq!(interned.as_str(), large);
}
#[test]
#[should_panic]
fn test_string_too_large() {
let large = "0".repeat(SLAB_SIZE + 1);
let mut pool = StringPool::new();
let _interned = pool.intern(large.as_bytes());
}
#[test]
fn test_lots_of_small_allocations() {
const NUM_PER_SLAB: usize = 32;
const NUM_SLABS: usize = 4;
let to_intern = "0".repeat(SLAB_SIZE / NUM_PER_SLAB);
let mut pool = StringPool::new();
let mut last_slab = pool.last;
let mut all_refs = Vec::new();
// Fill up first slab
for _alloc_idx in 0..NUM_PER_SLAB {
let interned = pool.intern(to_intern.as_bytes());
all_refs.push(interned);
assert!(std::ptr::eq(last_slab, pool.last));
}
for _slab_idx in 0..NUM_SLABS-1 {
for alloc_idx in 0..NUM_PER_SLAB {
let interned = pool.intern(to_intern.as_bytes());
all_refs.push(interned);
if alloc_idx == 0 {
// First allocation produces a new slab
assert!(!std::ptr::eq(last_slab, pool.last));
last_slab = pool.last;
} else {
assert!(std::ptr::eq(last_slab, pool.last));
}
}
}
// All strings are still correct
for string_ref in all_refs {
assert_eq!(string_ref.as_str(), to_intern);
}
}
}
|