Implement zero-copy tokenization for Word, SingleQuotedString, and Whitespace

Convert token string fields to use Cow<'a, str> to enable zero-copy tokenization
  for commonly used tokens:
  - Word.value: Regular identifiers and keywords now borrow from source
  - SingleQuotedString: String literals borrow when no escape processing needed
  - Whitespace: Single-line and multi-line comments borrow from source

Also adds a benchmark for measuring tokenization performance.
This commit is contained in:
Eyal Leshem 2025-12-03 16:05:06 +02:00
parent 0f17b327b9
commit 5458a2b21d
11 changed files with 1288 additions and 224 deletions

5
.gitignore vendored
View file

@ -18,4 +18,7 @@ Cargo.lock
*.swp
.DS_store
.DS_store
# dhat profiler output files
dhat*.json

View file

@ -48,6 +48,7 @@ visitor = ["sqlparser_derive"]
bigdecimal = { version = "0.4.1", features = ["serde"], optional = true }
log = "0.4"
recursive = { version = "0.1.1", optional = true}
unicase = "2.7"
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
# serde_json is only used in examples/cli, but we have to put it outside
@ -60,7 +61,12 @@ sqlparser_derive = { version = "0.4.0", path = "derive", optional = true }
simple_logger = "5.0"
matches = "0.1"
pretty_assertions = "1"
sysinfo = "0.30"
dhat = "0.3.3"
criterion = "0.5"
[package.metadata.docs.rs]
# Document these features on docs.rs
features = ["serde", "visitor"]
features = ["serde", "visitor"]

View file

@ -31,3 +31,7 @@ criterion = "0.7"
[[bench]]
name = "sqlparser_bench"
harness = false
[[bench]]
name = "tokenize_bench"
harness = false

View file

@ -0,0 +1,862 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Benchmark tokenization performance
//!
//! This benchmark measures tokenization speed using a complex SQL query
//! with many identifiers, keywords, string literals, and comments.
use criterion::{criterion_group, criterion_main, Criterion};
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::Tokenizer;
const COMPLEX_SQL: &str = r#"
-- ============================================================================
-- Enterprise Sales Analytics Dashboard Query
-- ============================================================================
-- Purpose: Comprehensive sales analysis across multiple dimensions
-- Author: Analytics Team
-- Last Modified: 2024-01-15
-- ============================================================================
/*
* This query aggregates sales data from multiple sources:
* - Customer transactions and lifetime value
* - Product performance across categories
* - Regional sales trends and patterns
* - Employee commission calculations
* - Inventory and fulfillment metrics
*/
WITH customer_segments AS (
-- Segment customers by purchase behavior and demographics
SELECT
customer_id,
customer_number,
customer_name,
customer_type,
customer_status,
customer_tier,
email_address,
phone_number,
mobile_number,
fax_number,
date_of_birth,
registration_date,
last_login_date,
account_status,
email_verified,
phone_verified,
-- Address information
billing_address_line1,
billing_address_line2,
billing_city,
billing_state,
billing_postal_code,
billing_country,
shipping_address_line1,
shipping_address_line2,
shipping_city,
shipping_state,
shipping_postal_code,
shipping_country,
-- Demographics
gender,
age_group,
income_bracket,
education_level,
occupation,
marital_status,
-- Marketing preferences
marketing_opt_in,
sms_opt_in,
email_frequency,
preferred_channel,
preferred_language,
-- Calculated fields
CASE
WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '30' DAY THEN 'highly_active'
WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '90' DAY THEN 'active'
WHEN customer_status = 'active' THEN 'inactive'
ELSE 'dormant'
END AS activity_level,
CASE
WHEN registration_date >= CURRENT_DATE - INTERVAL '1' YEAR THEN 'new'
WHEN registration_date >= CURRENT_DATE - INTERVAL '3' YEAR THEN 'established'
ELSE 'veteran'
END AS customer_tenure
FROM customers
WHERE customer_status IN ('active', 'pending', 'suspended')
AND registration_date >= '2020-01-01'
AND billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy')
AND email_address NOT LIKE '%@test.com'
AND email_address NOT LIKE '%@example.com'
AND customer_name IS NOT NULL
),
product_catalog AS (
-- Product information with categories and attributes
SELECT
product_id,
product_sku,
product_name,
product_description,
product_category,
product_subcategory,
product_brand,
product_manufacturer,
product_supplier,
product_model,
product_series,
product_version,
-- Pricing
list_price,
cost_price,
sale_price,
wholesale_price,
minimum_price,
suggested_retail_price,
-- Attributes
product_color,
product_size,
product_weight,
product_length,
product_width,
product_height,
product_material,
product_warranty,
-- Inventory
stock_quantity,
reorder_level,
reorder_quantity,
warehouse_location,
bin_location,
aisle_number,
shelf_number,
-- Status
product_status,
availability_status,
is_featured,
is_new_arrival,
is_on_sale,
is_clearance,
is_discontinued,
launch_date,
discontinuation_date,
-- Ratings
average_rating,
review_count,
return_rate,
defect_rate,
-- Categories
CASE
WHEN product_category = 'electronics' THEN 'high_tech'
WHEN product_category IN ('clothing', 'shoes', 'accessories') THEN 'fashion'
WHEN product_category IN ('home', 'garden', 'furniture') THEN 'home_living'
WHEN product_category IN ('sports', 'outdoor', 'fitness') THEN 'active_lifestyle'
ELSE 'general_merchandise'
END AS category_group
FROM products
WHERE product_status = 'active'
AND availability_status IN ('in_stock', 'low_stock', 'backorder')
AND is_discontinued = FALSE
AND launch_date <= CURRENT_DATE
),
order_transactions AS (
-- Order and transaction details
SELECT
order_id,
order_number,
order_date,
order_time,
order_timestamp,
customer_id,
order_status,
order_type,
order_channel,
order_source,
-- Payment information
payment_method,
payment_status,
payment_date,
payment_reference,
transaction_id,
authorization_code,
-- Financial details
subtotal_amount,
tax_amount,
shipping_amount,
discount_amount,
coupon_amount,
gift_card_amount,
total_amount,
paid_amount,
refund_amount,
net_amount,
-- Shipping details
shipping_method,
shipping_carrier,
tracking_number,
shipped_date,
estimated_delivery_date,
actual_delivery_date,
delivery_status,
signature_required,
-- Location
ship_to_address_line1,
ship_to_address_line2,
ship_to_city,
ship_to_state,
ship_to_postal_code,
ship_to_country,
-- Fulfillment
warehouse_id,
fulfillment_center,
picker_id,
packer_id,
shipper_id,
-- Timestamps
created_at,
updated_at,
completed_at,
cancelled_at,
-- Flags
is_gift,
is_rush_order,
is_international,
requires_signature,
is_business_order,
-- Notes
customer_notes,
internal_notes,
gift_message,
special_instructions
FROM orders
WHERE order_date >= '2023-01-01'
AND order_date < '2024-12-31'
AND order_status IN ('pending', 'processing', 'shipped', 'delivered', 'completed')
AND order_type IN ('standard', 'express', 'overnight', 'international')
AND total_amount > 0
AND customer_id IS NOT NULL
),
order_line_items AS (
-- Individual line items from orders
SELECT
line_item_id,
order_id,
product_id,
line_number,
-- Quantities
quantity_ordered,
quantity_shipped,
quantity_cancelled,
quantity_returned,
quantity_damaged,
-- Pricing
unit_price,
unit_cost,
unit_discount,
line_subtotal,
line_tax,
line_shipping,
line_total,
-- Discounts
discount_type,
discount_code,
discount_percentage,
discount_reason,
-- Product details at time of order
product_sku_snapshot,
product_name_snapshot,
product_category_snapshot,
-- Status
line_status,
fulfillment_status,
return_status,
-- Warehouse
picked_from_warehouse,
picked_from_location,
picked_by_user,
picked_at_timestamp,
packed_by_user,
packed_at_timestamp,
-- Returns
return_reason,
return_date,
refund_amount,
restocking_fee,
-- Gift wrap
is_gift_wrapped,
gift_wrap_type,
gift_wrap_charge,
-- Calculated fields
unit_price * quantity_ordered AS line_revenue,
unit_cost * quantity_ordered AS line_cost,
(unit_price - unit_cost) * quantity_ordered AS line_profit,
CASE
WHEN quantity_returned > 0 THEN 'returned'
WHEN quantity_cancelled > 0 THEN 'cancelled'
WHEN quantity_shipped = quantity_ordered THEN 'fulfilled'
ELSE 'partial'
END AS fulfillment_type
FROM order_items
WHERE line_status NOT IN ('cancelled', 'voided')
AND quantity_ordered > 0
),
employee_data AS (
-- Employee and sales representative information
SELECT
employee_id,
employee_number,
employee_name,
first_name,
last_name,
middle_name,
email_address,
phone_extension,
mobile_phone,
-- Employment details
hire_date,
termination_date,
employment_status,
employment_type,
job_title,
job_level,
job_grade,
department_id,
department_name,
division_id,
division_name,
-- Management
manager_id,
manager_name,
reports_to,
-- Location
office_location,
office_building,
office_floor,
office_room,
work_city,
work_state,
work_country,
-- Compensation
base_salary,
commission_rate,
bonus_target,
commission_tier,
-- Performance
sales_quota,
current_sales,
quota_attainment,
performance_rating,
last_review_date,
next_review_date
FROM employees
WHERE employment_status = 'active'
AND employee_id IS NOT NULL
AND hire_date <= CURRENT_DATE
),
customer_lifetime_metrics AS (
-- Calculate customer lifetime value and metrics
SELECT
cs.customer_id,
cs.customer_name,
cs.customer_tier,
cs.activity_level,
-- Order counts
COUNT(DISTINCT ot.order_id) AS total_orders,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.order_id END) AS orders_last_30_days,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.order_id END) AS orders_last_90_days,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.order_id END) AS orders_last_year,
-- Revenue metrics
SUM(ot.total_amount) AS lifetime_revenue,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_30_days,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_90_days,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_year,
-- Average values
AVG(ot.total_amount) AS average_order_value,
AVG(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount END) AS avg_order_value_last_year,
-- Product metrics
COUNT(DISTINCT oli.product_id) AS unique_products_purchased,
SUM(oli.quantity_ordered) AS total_items_purchased,
-- Return metrics
SUM(oli.quantity_returned) AS total_items_returned,
SUM(CASE WHEN oli.quantity_returned > 0 THEN oli.refund_amount ELSE 0 END) AS total_refund_amount,
-- Date ranges
MIN(ot.order_date) AS first_order_date,
MAX(ot.order_date) AS last_order_date,
MAX(ot.order_date) - MIN(ot.order_date) AS customer_lifespan_days,
-- Recency
CURRENT_DATE - MAX(ot.order_date) AS days_since_last_order
FROM customer_segments cs
LEFT JOIN order_transactions ot ON cs.customer_id = ot.customer_id
LEFT JOIN order_line_items oli ON ot.order_id = oli.order_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
cs.customer_id,
cs.customer_name,
cs.customer_tier,
cs.activity_level
),
product_performance AS (
-- Product sales performance metrics
SELECT
pc.product_id,
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.category_group,
-- Sales metrics
COUNT(DISTINCT oli.order_id) AS total_orders,
SUM(oli.quantity_ordered) AS total_quantity_sold,
SUM(oli.quantity_returned) AS total_quantity_returned,
SUM(oli.line_revenue) AS total_revenue,
SUM(oli.line_cost) AS total_cost,
SUM(oli.line_profit) AS total_profit,
-- Averages
AVG(oli.unit_price) AS average_selling_price,
AVG(oli.line_revenue) AS average_line_revenue,
-- Return rate
CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate,
-- Profit margin
CAST(SUM(oli.line_profit) AS DECIMAL) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin,
-- Rankings
RANK() OVER (PARTITION BY pc.product_category ORDER BY SUM(oli.line_revenue) DESC) AS revenue_rank_in_category,
RANK() OVER (ORDER BY SUM(oli.quantity_ordered) DESC) AS quantity_rank_overall
FROM product_catalog pc
INNER JOIN order_line_items oli ON pc.product_id = oli.product_id
INNER JOIN order_transactions ot ON oli.order_id = ot.order_id
WHERE ot.order_status IN ('delivered', 'completed')
AND ot.order_date >= '2023-01-01'
GROUP BY
pc.product_id,
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.category_group
),
regional_sales AS (
-- Sales performance by region
SELECT
cs.billing_country,
cs.billing_state,
cs.billing_city,
-- Order metrics
COUNT(DISTINCT ot.order_id) AS total_orders,
COUNT(DISTINCT cs.customer_id) AS unique_customers,
-- Revenue
SUM(ot.total_amount) AS total_revenue,
SUM(ot.shipping_amount) AS total_shipping_revenue,
SUM(ot.tax_amount) AS total_tax_collected,
AVG(ot.total_amount) AS average_order_value,
-- Time periods
SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2024,
SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2023,
-- Growth
(SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) -
SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END)) /
NULLIF(SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END), 0) AS year_over_year_growth
FROM customer_segments cs
INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
cs.billing_country,
cs.billing_state,
cs.billing_city
HAVING SUM(ot.total_amount) > 1000
),
monthly_trends AS (
-- Monthly sales trends and seasonality
SELECT
DATE_TRUNC('month', ot.order_date) AS order_month,
EXTRACT(YEAR FROM ot.order_date) AS order_year,
EXTRACT(MONTH FROM ot.order_date) AS month_number,
EXTRACT(QUARTER FROM ot.order_date) AS quarter_number,
-- Volume metrics
COUNT(DISTINCT ot.order_id) AS orders,
COUNT(DISTINCT ot.customer_id) AS customers,
SUM(oli.quantity_ordered) AS items_sold,
-- Financial metrics
SUM(ot.subtotal_amount) AS subtotal,
SUM(ot.tax_amount) AS tax,
SUM(ot.shipping_amount) AS shipping,
SUM(ot.discount_amount) AS discounts,
SUM(ot.total_amount) AS revenue,
-- Averages
AVG(ot.total_amount) AS avg_order_value,
AVG(oli.quantity_ordered) AS avg_items_per_order,
-- Moving averages
AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS three_month_moving_avg,
AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS six_month_moving_avg
FROM order_transactions ot
INNER JOIN order_line_items oli ON ot.order_id = oli.order_id
WHERE ot.order_status IN ('delivered', 'completed')
AND ot.order_date >= '2022-01-01'
GROUP BY
DATE_TRUNC('month', ot.order_date),
EXTRACT(YEAR FROM ot.order_date),
EXTRACT(MONTH FROM ot.order_date),
EXTRACT(QUARTER FROM ot.order_date)
),
category_analysis AS (
-- Category performance analysis
SELECT
pc.product_category,
pc.product_subcategory,
pc.category_group,
-- Sales
COUNT(DISTINCT oli.order_id) AS orders,
SUM(oli.quantity_ordered) AS quantity,
SUM(oli.line_revenue) AS revenue,
SUM(oli.line_profit) AS profit,
-- Market share
SUM(oli.line_revenue) / SUM(SUM(oli.line_revenue)) OVER () AS revenue_share,
-- Pricing
AVG(oli.unit_price) AS avg_price,
MIN(oli.unit_price) AS min_price,
MAX(oli.unit_price) AS max_price,
-- Profitability
SUM(oli.line_profit) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin,
-- Returns
SUM(oli.quantity_returned) AS returns,
CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate
FROM product_catalog pc
INNER JOIN order_line_items oli ON pc.product_id = oli.product_id
INNER JOIN order_transactions ot ON oli.order_id = ot.order_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
pc.product_category,
pc.product_subcategory,
pc.category_group
)
-- Main query combining all CTEs
SELECT
-- Customer information
cs.customer_id,
cs.customer_number,
cs.customer_name,
cs.customer_type,
cs.customer_tier,
cs.activity_level,
cs.customer_tenure,
cs.email_address,
cs.phone_number,
cs.billing_city,
cs.billing_state,
cs.billing_country,
cs.age_group,
cs.gender,
cs.income_bracket,
-- Customer metrics
clm.total_orders,
clm.orders_last_30_days,
clm.orders_last_90_days,
clm.orders_last_year,
clm.lifetime_revenue,
clm.revenue_last_30_days,
clm.revenue_last_90_days,
clm.revenue_last_year,
clm.average_order_value,
clm.unique_products_purchased,
clm.total_items_purchased,
clm.total_items_returned,
clm.first_order_date,
clm.last_order_date,
clm.days_since_last_order,
-- Order details
ot.order_id,
ot.order_number,
ot.order_date,
ot.order_status,
ot.order_type,
ot.order_channel,
ot.payment_method,
ot.payment_status,
ot.subtotal_amount,
ot.tax_amount,
ot.shipping_amount,
ot.discount_amount,
ot.total_amount,
ot.shipping_method,
ot.shipping_carrier,
ot.tracking_number,
ot.delivery_status,
-- Line item details
oli.line_item_id,
oli.product_id,
oli.quantity_ordered,
oli.quantity_shipped,
oli.unit_price,
oli.line_total,
oli.discount_type,
oli.line_status,
-- Product information
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.product_manufacturer,
pc.category_group,
pc.list_price,
pc.product_color,
pc.product_size,
pc.average_rating,
pc.review_count,
-- Product performance
pp.total_quantity_sold AS product_total_quantity_sold,
pp.total_revenue AS product_total_revenue,
pp.total_profit AS product_total_profit,
pp.return_rate AS product_return_rate,
pp.profit_margin AS product_profit_margin,
pp.revenue_rank_in_category,
-- Employee information
ed.employee_id,
ed.employee_name,
ed.job_title,
ed.department_name,
ed.office_location,
ed.commission_rate,
ed.sales_quota,
-- Regional metrics
rs.total_orders AS region_total_orders,
rs.unique_customers AS region_unique_customers,
rs.total_revenue AS region_total_revenue,
rs.average_order_value AS region_avg_order_value,
rs.year_over_year_growth AS region_yoy_growth,
-- Category metrics
ca.revenue AS category_revenue,
ca.profit AS category_profit,
ca.revenue_share AS category_revenue_share,
ca.profit_margin AS category_profit_margin,
ca.return_rate AS category_return_rate,
-- Monthly trends
mt.order_month,
mt.three_month_moving_avg,
mt.six_month_moving_avg,
-- Calculated fields
CASE
WHEN clm.lifetime_revenue > 10000 THEN 'vip'
WHEN clm.lifetime_revenue > 5000 THEN 'premium'
WHEN clm.lifetime_revenue > 1000 THEN 'standard'
ELSE 'basic'
END AS calculated_tier,
CASE
WHEN clm.days_since_last_order <= 30 THEN 'very_recent'
WHEN clm.days_since_last_order <= 90 THEN 'recent'
WHEN clm.days_since_last_order <= 180 THEN 'moderate'
ELSE 'at_risk'
END AS recency_segment,
CASE
WHEN clm.total_orders >= 50 THEN 'frequent'
WHEN clm.total_orders >= 20 THEN 'regular'
WHEN clm.total_orders >= 5 THEN 'occasional'
ELSE 'rare'
END AS frequency_segment,
oli.unit_price * oli.quantity_ordered AS calculated_line_revenue,
(oli.unit_price * oli.quantity_ordered) * (ed.commission_rate / 100) AS calculated_commission,
ROUND(oli.unit_price * oli.quantity_ordered * 0.9, 2) AS discounted_line_total,
-- Window functions
ROW_NUMBER() OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date DESC) AS order_recency_rank,
RANK() OVER (PARTITION BY cs.billing_country ORDER BY clm.lifetime_revenue DESC) AS customer_value_rank_in_country,
DENSE_RANK() OVER (PARTITION BY pc.product_category ORDER BY oli.quantity_ordered DESC) AS product_popularity_rank,
SUM(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_customer_revenue,
AVG(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS rolling_5_order_avg,
-- Aggregates
SUM(oli.quantity_ordered) OVER (PARTITION BY pc.product_category) AS category_total_quantity,
COUNT(DISTINCT ot.order_id) OVER (PARTITION BY cs.billing_country, DATE_TRUNC('month', ot.order_date)) AS monthly_orders_in_country,
MAX(ot.total_amount) OVER (PARTITION BY cs.customer_id) AS customer_largest_order,
MIN(ot.order_date) OVER (PARTITION BY pc.product_id) AS product_first_sale_date
FROM customer_segments cs
INNER JOIN customer_lifetime_metrics clm ON cs.customer_id = clm.customer_id
INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id
INNER JOIN order_line_items oli ON ot.order_id = oli.order_id
INNER JOIN product_catalog pc ON oli.product_id = pc.product_id
INNER JOIN product_performance pp ON pc.product_id = pp.product_id
LEFT JOIN employee_data ed ON ot.order_id IN (
SELECT order_id FROM employee_assignments WHERE employee_id = ed.employee_id
)
LEFT JOIN regional_sales rs ON cs.billing_country = rs.billing_country
AND cs.billing_state = rs.billing_state
AND cs.billing_city = rs.billing_city
LEFT JOIN category_analysis ca ON pc.product_category = ca.product_category
AND pc.product_subcategory = ca.product_subcategory
LEFT JOIN monthly_trends mt ON DATE_TRUNC('month', ot.order_date) = mt.order_month
WHERE
-- Date filters
ot.order_date >= '2023-01-01'
AND ot.order_date < '2024-12-31'
-- Status filters
AND ot.order_status IN ('processing', 'shipped', 'delivered', 'completed')
AND oli.line_status NOT IN ('cancelled', 'voided', 'rejected')
AND cs.customer_status = 'active'
AND pc.product_status = 'active'
-- Geographic filters
AND cs.billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy', 'Japan', 'Australia')
AND cs.billing_state NOT IN ('test', 'demo', 'internal')
-- Category filters
AND pc.product_category IN ('electronics', 'clothing', 'home', 'sports', 'books', 'toys', 'automotive', 'health', 'beauty', 'grocery')
AND pc.product_subcategory NOT LIKE '%test%'
-- Amount filters
AND ot.total_amount > 0
AND ot.total_amount < 100000
AND oli.quantity_ordered > 0
AND oli.unit_price > 0
-- Quality filters
AND cs.email_address NOT LIKE '%@test.com'
AND cs.email_address NOT LIKE '%@example.com'
AND cs.email_address NOT LIKE '%@invalid.com'
AND cs.customer_name NOT LIKE '%test%'
AND cs.customer_name NOT LIKE '%demo%'
AND pc.product_name NOT LIKE '%sample%'
AND pc.product_name NOT LIKE '%demo%'
-- Tier filters
AND cs.customer_tier IN ('gold', 'silver', 'bronze', 'platinum')
AND cs.activity_level IN ('highly_active', 'active')
-- Payment filters
AND ot.payment_status = 'completed'
AND ot.payment_method IN ('credit_card', 'debit_card', 'paypal', 'apple_pay', 'google_pay', 'bank_transfer')
-- Shipping filters
AND ot.delivery_status IN ('delivered', 'in_transit', 'out_for_delivery')
AND ot.shipping_method IN ('standard', 'express', 'overnight', 'two_day')
-- Channel filters
AND ot.order_channel IN ('web', 'mobile', 'tablet', 'phone', 'store', 'marketplace')
-- Null checks
AND cs.customer_id IS NOT NULL
AND ot.order_id IS NOT NULL
AND oli.product_id IS NOT NULL
AND pc.product_sku IS NOT NULL
AND ot.total_amount IS NOT NULL
GROUP BY
cs.customer_id, cs.customer_number, cs.customer_name, cs.customer_type, cs.customer_tier,
cs.activity_level, cs.customer_tenure, cs.email_address, cs.phone_number,
cs.billing_city, cs.billing_state, cs.billing_country, cs.age_group, cs.gender, cs.income_bracket,
clm.total_orders, clm.orders_last_30_days, clm.orders_last_90_days, clm.orders_last_year,
clm.lifetime_revenue, clm.revenue_last_30_days, clm.revenue_last_90_days, clm.revenue_last_year,
clm.average_order_value, clm.unique_products_purchased, clm.total_items_purchased,
clm.total_items_returned, clm.first_order_date, clm.last_order_date, clm.days_since_last_order,
ot.order_id, ot.order_number, ot.order_date, ot.order_status, ot.order_type, ot.order_channel,
ot.payment_method, ot.payment_status, ot.subtotal_amount, ot.tax_amount, ot.shipping_amount,
ot.discount_amount, ot.total_amount, ot.shipping_method, ot.shipping_carrier, ot.tracking_number,
ot.delivery_status, oli.line_item_id, oli.product_id, oli.quantity_ordered, oli.quantity_shipped,
oli.unit_price, oli.line_total, oli.discount_type, oli.line_status,
pc.product_sku, pc.product_name, pc.product_category, pc.product_subcategory, pc.product_brand,
pc.product_manufacturer, pc.category_group, pc.list_price, pc.product_color, pc.product_size,
pc.average_rating, pc.review_count, pp.total_quantity_sold, pp.total_revenue, pp.total_profit,
pp.return_rate, pp.profit_margin, pp.revenue_rank_in_category,
ed.employee_id, ed.employee_name, ed.job_title, ed.department_name, ed.office_location,
ed.commission_rate, ed.sales_quota, rs.total_orders, rs.unique_customers, rs.total_revenue,
rs.average_order_value, rs.year_over_year_growth,
ca.revenue, ca.profit, ca.revenue_share, ca.profit_margin, ca.return_rate,
mt.order_month, mt.three_month_moving_avg, mt.six_month_moving_avg
HAVING
SUM(oli.quantity_ordered) > 0
AND SUM(oli.line_total) > 0
AND COUNT(DISTINCT ot.order_id) >= 1
ORDER BY
clm.lifetime_revenue DESC,
clm.total_orders DESC,
ot.order_date DESC,
cs.customer_name ASC,
pc.product_category ASC,
pc.product_name ASC,
oli.line_number ASC,
ot.order_id ASC
LIMIT 100000
OFFSET 0;
-- Additional analytics queries for dashboard
-- Top customers by revenue
SELECT
customer_id,
customer_name,
customer_tier,
total_orders,
lifetime_revenue,
average_order_value,
days_since_last_order
FROM customer_lifetime_metrics
WHERE lifetime_revenue > 1000
ORDER BY lifetime_revenue DESC
LIMIT 100;
-- Top products by sales
SELECT
product_sku,
product_name,
product_category,
product_brand,
total_quantity_sold,
total_revenue,
total_profit,
profit_margin,
return_rate
FROM product_performance
WHERE total_revenue > 5000
ORDER BY total_revenue DESC
LIMIT 50;
-- Regional performance summary
SELECT
billing_country,
billing_state,
total_orders,
unique_customers,
total_revenue,
average_order_value,
year_over_year_growth
FROM regional_sales
WHERE total_revenue > 10000
ORDER BY total_revenue DESC;
"#;
/// Benchmark tokenizing `COMPLEX_SQL` with the generic SQL dialect.
///
/// Each iteration measures the full tokenizer pass (tokenizer construction
/// plus `tokenize()`), so the reported time reflects end-to-end tokenization
/// cost for a large, comment- and literal-heavy query.
fn tokenization_benchmark(c: &mut Criterion) {
    use std::hint::black_box;

    let mut group = c.benchmark_group("tokenization");
    let dialect = GenericDialect {};
    group.bench_function("tokenize_complex_sql", |b| {
        b.iter(|| {
            // black_box on the input prevents the optimizer from const-folding
            // tokenization of the compile-time constant; black_box on the
            // output keeps the token stream from being discarded as dead code.
            // Either would skew the measured time.
            let mut tokenizer = Tokenizer::new(&dialect, black_box(COMPLEX_SQL));
            black_box(tokenizer.tokenize().unwrap())
        });
    });
    group.finish();
}
criterion_group!(benches, tokenization_benchmark);
criterion_main!(benches);

View file

@ -1251,7 +1251,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result<Statement, ParserError> {
continue_loop = false;
let next_token = parser.next_token();
match next_token.token {
BorrowedToken::SingleQuotedString(s) => files.push(s),
BorrowedToken::SingleQuotedString(s) => files.push(s.into_owned()),
_ => parser.expected("file token", next_token)?,
};
if parser.next_token().token.eq(&BorrowedToken::Comma) {
@ -1266,7 +1266,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result<Statement, ParserError> {
parser.expect_token(&BorrowedToken::Eq)?;
let next_token = parser.next_token();
pattern = Some(match next_token.token {
BorrowedToken::SingleQuotedString(s) => s,
BorrowedToken::SingleQuotedString(s) => s.into_owned(),
_ => parser.expected("pattern", next_token)?,
});
// VALIDATION MODE
@ -1417,7 +1417,7 @@ fn parse_stage_params(parser: &Parser) -> Result<StageParamsObject, ParserError>
if parser.parse_keyword(Keyword::URL) {
parser.expect_token(&BorrowedToken::Eq)?;
url = Some(match parser.next_token().token {
BorrowedToken::SingleQuotedString(word) => Ok(word),
BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
_ => parser.expected("a URL statement", parser.peek_token()),
}?)
}
@ -1432,7 +1432,7 @@ fn parse_stage_params(parser: &Parser) -> Result<StageParamsObject, ParserError>
if parser.parse_keyword(Keyword::ENDPOINT) {
parser.expect_token(&BorrowedToken::Eq)?;
endpoint = Some(match parser.next_token().token {
BorrowedToken::SingleQuotedString(word) => Ok(word),
BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
_ => parser.expected("an endpoint statement", parser.peek_token()),
}?)
}
@ -1486,7 +1486,7 @@ fn parse_session_options(parser: &Parser, set: bool) -> Result<Vec<KeyValueOptio
options.push(option);
} else {
options.push(KeyValueOption {
option_name: key.value,
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Single(Value::Placeholder(empty())),
});
}

View file

@ -1248,3 +1248,57 @@ pub const RESERVED_FOR_IDENTIFIER: &[Keyword] = &[
Keyword::STRUCT,
Keyword::TRIM,
];
#[cfg(feature = "std")]
use std::collections::HashMap;
#[cfg(feature = "std")]
use std::sync::OnceLock;
#[cfg(feature = "std")]
use unicase::UniCase;
/// Lazy-initialized HashMap for O(1) keyword lookups.
///
/// Keys are wrapped in [`UniCase`] so lookups are ASCII case-insensitive
/// (SQL keywords are all ASCII). Populated on first use by
/// `get_keyword_map` via `OnceLock::get_or_init`.
#[cfg(feature = "std")]
static KEYWORD_MAP: OnceLock<HashMap<UniCase<&'static str>, Keyword>> = OnceLock::new();
/// Return the shared keyword lookup map, building it on first access.
///
/// The map pairs each entry of `ALL_KEYWORDS` with the corresponding
/// `Keyword` variant from `ALL_KEYWORDS_INDEX`, keyed case-insensitively
/// via `UniCase::ascii`.
#[cfg(feature = "std")]
fn get_keyword_map() -> &'static HashMap<UniCase<&'static str>, Keyword> {
    KEYWORD_MAP.get_or_init(|| {
        ALL_KEYWORDS
            .iter()
            .zip(ALL_KEYWORDS_INDEX.iter())
            .map(|(text, kw)| (UniCase::ascii(*text), *kw))
            .collect()
    })
}
/// Look up a keyword by string, case-insensitively, with O(1) complexity.
///
/// # Arguments
/// * `word` - The word to look up (ASCII case-insensitive)
///
/// # Returns
/// * `Some(Keyword)` when `word` matches a SQL keyword
/// * `None` otherwise
///
/// # Example
/// ```
/// use sqlparser::keywords::{get_keyword, Keyword};
///
/// assert_eq!(get_keyword("SELECT"), Some(Keyword::SELECT));
/// assert_eq!(get_keyword("select"), Some(Keyword::SELECT));
/// assert_eq!(get_keyword("my_table"), None);
/// ```
#[cfg(feature = "std")]
pub fn get_keyword(word: &str) -> Option<Keyword> {
    // Wrap in UniCase so the hash/equality ignore ASCII case; copy the
    // Keyword out of the map (Keyword is a small Copy enum).
    let key = UniCase::ascii(word);
    get_keyword_map().get(&key).copied()
}
/// Fallback for no_std: use binary search (same as before)
///
/// Performs an O(log n) case-insensitive binary search over `ALL_KEYWORDS`
/// and maps a hit to its `Keyword` variant via the parallel
/// `ALL_KEYWORDS_INDEX` array.
///
/// NOTE(review): correctness depends on `ALL_KEYWORDS` being sorted in an
/// order consistent with `UniCase::ascii` comparison (case-folded ASCII
/// ordering) — presumably true since the list is uppercase and sorted, but
/// confirm if keywords with non-alphabetic characters are added.
#[cfg(not(feature = "std"))]
pub fn get_keyword(word: &str) -> Option<Keyword> {
    ALL_KEYWORDS
        // Compare each candidate against `word` case-insensitively; both sides
        // are wrapped in UniCase so the fold is applied symmetrically.
        .binary_search_by(|k| unicase::UniCase::ascii(k).cmp(&unicase::UniCase::ascii(&word)))
        .ok()
        // A hit at index `idx` maps to the enum variant at the same position.
        .map(|idx| ALL_KEYWORDS_INDEX[idx])
}

View file

@ -14,6 +14,7 @@
#[cfg(not(feature = "std"))]
use alloc::{
borrow::Cow,
boxed::Box,
format,
string::{String, ToString},
@ -25,6 +26,8 @@ use core::{
str::FromStr,
};
use helpers::attached_token::AttachedToken;
#[cfg(feature = "std")]
use std::borrow::Cow;
use log::debug;
@ -1793,8 +1796,11 @@ impl<'a> Parser<'a> {
break;
}
BorrowedToken::SingleQuotedString(s) => {
let expr =
Expr::Identifier(Ident::with_quote_and_span('\'', next_token.span, s));
let expr = Expr::Identifier(Ident::with_quote_and_span(
'\'',
next_token.span,
s.as_ref(),
));
chain.push(AccessExpr::Dot(expr));
self.advance_token(); // The consumed string
}
@ -3893,7 +3899,7 @@ impl<'a> Parser<'a> {
// any keyword here unquoted.
keyword: _,
}) => Ok(JsonPathElem::Dot {
key: value,
key: value.to_string(),
quoted: quote_style.is_some(),
}),
@ -7744,7 +7750,9 @@ impl<'a> Parser<'a> {
if dialect_of!(self is HiveDialect) && self.parse_keyword(Keyword::COMMENT) {
let next_token = self.next_token();
match next_token.token {
BorrowedToken::SingleQuotedString(str) => Some(CommentDef::WithoutEq(str)),
BorrowedToken::SingleQuotedString(str) => {
Some(CommentDef::WithoutEq(str.into_owned()))
}
_ => self.expected("comment", next_token)?,
}
} else {
@ -7965,11 +7973,11 @@ impl<'a> Parser<'a> {
let comment = match (has_eq, value.token) {
(true, BorrowedToken::SingleQuotedString(s)) => {
Ok(Some(SqlOption::Comment(CommentDef::WithEq(s))))
}
(false, BorrowedToken::SingleQuotedString(s)) => {
Ok(Some(SqlOption::Comment(CommentDef::WithoutEq(s))))
Ok(Some(SqlOption::Comment(CommentDef::WithEq(s.into_owned()))))
}
(false, BorrowedToken::SingleQuotedString(s)) => Ok(Some(SqlOption::Comment(
CommentDef::WithoutEq(s.into_owned()),
))),
(_, token) => self.expected(
"BorrowedToken::SingleQuotedString",
TokenWithSpan::wrap(token),
@ -8014,8 +8022,8 @@ impl<'a> Parser<'a> {
let value = self.next_token();
let tablespace = match value.token {
BorrowedToken::Word(Word { value: name, .. })
| BorrowedToken::SingleQuotedString(name) => {
BorrowedToken::Word(Word { value: name, .. }) => {
let name = name.to_string();
let storage = match self.parse_keyword(Keyword::STORAGE) {
true => {
let _ = self.consume_token(&BorrowedToken::Eq);
@ -8038,6 +8046,28 @@ impl<'a> Parser<'a> {
storage,
})))
}
BorrowedToken::SingleQuotedString(name) => {
let storage = match self.parse_keyword(Keyword::STORAGE) {
true => {
let _ = self.consume_token(&BorrowedToken::Eq);
let storage_token = self.next_token();
match &storage_token.token {
BorrowedToken::Word(w) => match w.value.to_uppercase().as_str() {
"DISK" => Some(StorageType::Disk),
"MEMORY" => Some(StorageType::Memory),
_ => self.expected("DISK or MEMORY", storage_token)?,
},
_ => self.expected("BorrowedToken::Word", storage_token)?,
}
}
false => None,
};
Ok(Some(SqlOption::TableSpace(TablespaceOption {
name: name.into_owned(),
storage,
})))
}
_ => {
return self.expected("BorrowedToken::Word", value)?;
}
@ -8176,7 +8206,7 @@ impl<'a> Parser<'a> {
pub fn parse_comment_value(&self) -> Result<String, ParserError> {
let next_token = self.next_token();
let value = match next_token.token {
BorrowedToken::SingleQuotedString(str) => str,
BorrowedToken::SingleQuotedString(str) => str.into_owned(),
BorrowedToken::DollarQuotedString(str) => str.value,
_ => self.expected("string literal", next_token)?,
};
@ -10381,8 +10411,8 @@ impl<'a> Parser<'a> {
}
Keyword::NULL => ok_value(Value::Null),
Keyword::NoKeyword if w.quote_style.is_some() => match w.quote_style {
Some('"') => ok_value(Value::DoubleQuotedString(w.value)),
Some('\'') => ok_value(Value::SingleQuotedString(w.value)),
Some('"') => ok_value(Value::DoubleQuotedString(w.value.into_owned())),
Some('\'') => ok_value(Value::SingleQuotedString(w.value.into_owned())),
_ => self.expected(
"A value?",
TokenWithSpan {
@ -10484,11 +10514,18 @@ impl<'a> Parser<'a> {
fn maybe_concat_string_literal(&self, mut str: String) -> String {
if self.dialect.supports_string_literal_concatenation() {
while let BorrowedToken::SingleQuotedString(ref s)
| BorrowedToken::DoubleQuotedString(ref s) = self.peek_token_ref().token
{
str.push_str(s.clone().as_str());
self.advance_token();
loop {
match &self.peek_token_ref().token {
BorrowedToken::SingleQuotedString(s) => {
str.push_str(s.as_ref());
self.advance_token();
}
BorrowedToken::DoubleQuotedString(s) => {
str.push_str(s);
self.advance_token();
}
_ => break,
}
}
}
str
@ -10584,8 +10621,8 @@ impl<'a> Parser<'a> {
value,
keyword: Keyword::NoKeyword,
..
}) => Ok(value),
BorrowedToken::SingleQuotedString(s) => Ok(s),
}) => Ok(value.into_owned()),
BorrowedToken::SingleQuotedString(s) => Ok(s.into_owned()),
BorrowedToken::DoubleQuotedString(s) => Ok(s),
BorrowedToken::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
Ok(s)
@ -11100,7 +11137,7 @@ impl<'a> Parser<'a> {
loop {
let next_token = self.next_token();
match next_token.token {
BorrowedToken::SingleQuotedString(value) => values.push(value),
BorrowedToken::SingleQuotedString(value) => values.push(value.into_owned()),
_ => self.expected("a string", next_token)?,
}
let next_token = self.next_token();
@ -12125,7 +12162,7 @@ impl<'a> Parser<'a> {
match next_token.token {
BorrowedToken::Word(w) => modifiers.push(w.to_string()),
BorrowedToken::Number(n, _) => modifiers.push(n),
BorrowedToken::SingleQuotedString(s) => modifiers.push(s),
BorrowedToken::SingleQuotedString(s) => modifiers.push(s.into_owned()),
BorrowedToken::Comma => {
continue;
@ -13261,7 +13298,7 @@ impl<'a> Parser<'a> {
if token2 == BorrowedToken::Period {
match token1.token {
BorrowedToken::Word(w) => {
schema_name = w.value;
schema_name = w.value.to_string();
}
_ => {
return self.expected("Schema name", token1);
@ -13269,7 +13306,7 @@ impl<'a> Parser<'a> {
}
match token3.token {
BorrowedToken::Word(w) => {
table_name = w.value;
table_name = w.value.to_string();
}
_ => {
return self.expected("Table name", token3);
@ -13282,7 +13319,7 @@ impl<'a> Parser<'a> {
} else {
match token1.token {
BorrowedToken::Word(w) => {
table_name = w.value;
table_name = w.value.to_string();
}
_ => {
return self.expected("Table name", token1);
@ -14408,7 +14445,9 @@ impl<'a> Parser<'a> {
None => {
let next_token = self.next_token();
if let BorrowedToken::Word(w) = next_token.token {
Expr::Value(Value::Placeholder(w.value).with_span(next_token.span))
Expr::Value(
Value::Placeholder(w.value.into_owned()).with_span(next_token.span),
)
} else {
return parser_err!(
"Expecting number or byte length e.g. 100M",
@ -14962,7 +15001,7 @@ impl<'a> Parser<'a> {
let r#type = self.parse_data_type()?;
let path = if let BorrowedToken::SingleQuotedString(path) = self.peek_token().token {
self.next_token();
Some(path)
Some(path.into_owned())
} else {
None
};
@ -16491,7 +16530,7 @@ impl<'a> Parser<'a> {
let opt_ilike = if self.parse_keyword(Keyword::ILIKE) {
let next_token = self.next_token();
let pattern = match next_token.token {
BorrowedToken::SingleQuotedString(s) => s,
BorrowedToken::SingleQuotedString(s) => s.into_owned(),
_ => return self.expected("ilike pattern", next_token),
};
Some(IlikeSelectItem { pattern })
@ -17128,7 +17167,11 @@ impl<'a> Parser<'a> {
(true, _) => BorrowedToken::RParen,
(false, BorrowedToken::EOF) => BorrowedToken::EOF,
(false, BorrowedToken::Word(w)) if end_kws.contains(&w.keyword) => {
BorrowedToken::Word(w)
BorrowedToken::Word(Word {
value: Cow::Owned(w.value.into_owned()),
quote_style: w.quote_style,
keyword: w.keyword,
})
}
(false, _) => BorrowedToken::SemiColon,
};
@ -18327,27 +18370,27 @@ impl<'a> Parser<'a> {
self.expect_token(&BorrowedToken::Eq)?;
match self.peek_token().token {
BorrowedToken::SingleQuotedString(_) => Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
}),
BorrowedToken::Word(word)
if word.keyword == Keyword::TRUE || word.keyword == Keyword::FALSE =>
{
Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
})
}
BorrowedToken::Number(..) => Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Single(self.parse_value()?.into()),
}),
BorrowedToken::Word(word) => {
self.next_token();
Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Single(Value::Placeholder(
word.value.clone(),
word.value.to_string(),
)),
})
}
@ -18365,12 +18408,12 @@ impl<'a> Parser<'a> {
Some(values) => {
let values = values.into_iter().map(|v| v.value).collect();
Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::Multi(values),
})
}
None => Ok(KeyValueOption {
option_name: key.value.clone(),
option_name: key.value.to_string(),
option_value: KeyValueOptionKind::KeyValueOptions(Box::new(
self.parse_key_value_options(true, &[])?,
)),
@ -18405,11 +18448,11 @@ fn maybe_prefixed_expr(expr: Expr, prefix: Option<Ident>) -> Expr {
}
}
impl Word {
impl Word<'_> {
#[deprecated(since = "0.54.0", note = "please use `into_ident` instead")]
pub fn to_ident(&self, span: Span) -> Ident {
Ident {
value: self.value.clone(),
value: self.value.to_string(),
quote_style: self.quote_style,
span,
}
@ -18418,7 +18461,7 @@ impl Word {
/// Convert this word into an [`Ident`] identifier
pub fn into_ident(self, span: Span) -> Ident {
Ident {
value: self.value,
value: self.value.into_owned(),
quote_style: self.quote_style,
span,
}

View file

@ -23,7 +23,7 @@
#[cfg(not(feature = "std"))]
use alloc::{
borrow::{Cow, ToOwned},
borrow::Cow,
format,
string::{String, ToString},
vec,
@ -48,7 +48,7 @@ use crate::dialect::{
BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::keywords::Keyword;
use crate::{ast::DollarQuotedString, dialect::HiveDialect};
/// SQL Token enumeration with lifetime parameter for future zero-copy support
@ -59,13 +59,13 @@ pub enum BorrowedToken<'a> {
/// An end-of-file marker, not a real token
EOF,
/// A keyword (like SELECT) or an optionally quoted SQL identifier
Word(Word),
Word(Word<'a>),
/// An unsigned numeric literal
Number(String, bool),
/// A character that could not be tokenized
Char(char),
/// Single quoted string: i.e: 'string'
SingleQuotedString(String),
SingleQuotedString(Cow<'a, str>),
/// Double quoted string: i.e: "string"
DoubleQuotedString(String),
/// Triple single quoted strings: Example '''abc'''
@ -110,7 +110,7 @@ pub enum BorrowedToken<'a> {
/// Comma
Comma,
/// Whitespace (space, tab, etc)
Whitespace(Whitespace),
Whitespace(Whitespace<'a>),
/// Double equals sign `==`
DoubleEq,
/// Equality operator `=`
@ -280,8 +280,6 @@ pub enum BorrowedToken<'a> {
/// This is used to represent any custom binary operator that is not part of the SQL standard.
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
CustomBinaryOperator(String),
/// Marker to carry the lifetime parameter (never constructed)
_Phantom(Cow<'a, str>),
}
/// Type alias for backward compatibility - Token without explicit lifetime uses 'static
@ -399,7 +397,6 @@ impl<'a> fmt::Display for BorrowedToken<'a> {
BorrowedToken::QuestionAnd => write!(f, "?&"),
BorrowedToken::QuestionPipe => write!(f, "?|"),
BorrowedToken::CustomBinaryOperator(s) => f.write_str(s),
BorrowedToken::_Phantom(_) => unreachable!("_Phantom should never be constructed"),
}
}
}
@ -409,10 +406,16 @@ impl<'a> BorrowedToken<'a> {
pub fn to_static(self) -> Token {
match self {
BorrowedToken::EOF => BorrowedToken::EOF,
BorrowedToken::Word(w) => BorrowedToken::Word(w),
BorrowedToken::Word(w) => BorrowedToken::Word(Word {
value: Cow::Owned(w.value.into_owned()),
quote_style: w.quote_style,
keyword: w.keyword,
}),
BorrowedToken::Number(n, l) => BorrowedToken::Number(n, l),
BorrowedToken::Char(c) => BorrowedToken::Char(c),
BorrowedToken::SingleQuotedString(s) => BorrowedToken::SingleQuotedString(s),
BorrowedToken::SingleQuotedString(s) => {
BorrowedToken::SingleQuotedString(Cow::Owned(s.into_owned()))
}
BorrowedToken::DoubleQuotedString(s) => BorrowedToken::DoubleQuotedString(s),
BorrowedToken::TripleSingleQuotedString(s) => {
BorrowedToken::TripleSingleQuotedString(s)
@ -450,7 +453,20 @@ impl<'a> BorrowedToken<'a> {
BorrowedToken::UnicodeStringLiteral(s) => BorrowedToken::UnicodeStringLiteral(s),
BorrowedToken::HexStringLiteral(s) => BorrowedToken::HexStringLiteral(s),
BorrowedToken::Comma => BorrowedToken::Comma,
BorrowedToken::Whitespace(ws) => BorrowedToken::Whitespace(ws),
BorrowedToken::Whitespace(ws) => BorrowedToken::Whitespace(match ws {
Whitespace::Space => Whitespace::Space,
Whitespace::Newline => Whitespace::Newline,
Whitespace::Tab => Whitespace::Tab,
Whitespace::SingleLineComment { comment, prefix } => {
Whitespace::SingleLineComment {
comment: Cow::Owned(comment.into_owned()),
prefix: Cow::Owned(prefix.into_owned()),
}
}
Whitespace::MultiLineComment(s) => {
Whitespace::MultiLineComment(Cow::Owned(s.into_owned()))
}
}),
BorrowedToken::DoubleEq => BorrowedToken::DoubleEq,
BorrowedToken::Eq => BorrowedToken::Eq,
BorrowedToken::Neq => BorrowedToken::Neq,
@ -545,7 +561,6 @@ impl<'a> BorrowedToken<'a> {
BorrowedToken::QuestionAnd => BorrowedToken::QuestionAnd,
BorrowedToken::QuestionPipe => BorrowedToken::QuestionPipe,
BorrowedToken::CustomBinaryOperator(s) => BorrowedToken::CustomBinaryOperator(s),
BorrowedToken::_Phantom(_) => unreachable!("_Phantom should never be constructed"),
}
}
}
@ -556,13 +571,26 @@ impl BorrowedToken<'static> {
}
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
let word_uppercase = word.to_uppercase();
BorrowedToken::Word(Word {
value: word.to_string(),
value: Cow::Owned(word.to_string()),
quote_style,
keyword: if quote_style.is_none() {
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
crate::keywords::get_keyword(word).unwrap_or(Keyword::NoKeyword)
} else {
Keyword::NoKeyword
},
})
}
}
impl<'a> BorrowedToken<'a> {
/// Create a Word token with a borrowed string (zero-copy)
pub fn make_word_borrowed(word: &'a str, quote_style: Option<char>) -> Self {
BorrowedToken::Word(Word {
value: Cow::Borrowed(word),
quote_style,
keyword: if quote_style.is_none() {
crate::keywords::get_keyword(word).unwrap_or(Keyword::NoKeyword)
} else {
Keyword::NoKeyword
},
@ -574,10 +602,10 @@ impl BorrowedToken<'static> {
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
pub struct Word<'a> {
/// The value of the token, without the enclosing quotes, and with the
/// escape sequences (if any) processed (TODO: escapes are not handled)
pub value: String,
pub value: Cow<'a, str>,
/// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
/// The standard and most implementations allow using double quotes for this,
/// but some implementations support other quoting styles as well (e.g. \[MS SQL])
@ -587,7 +615,7 @@ pub struct Word {
pub keyword: Keyword,
}
impl fmt::Display for Word {
impl fmt::Display for Word<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.quote_style {
Some(s) if s == '"' || s == '[' || s == '`' => {
@ -599,7 +627,7 @@ impl fmt::Display for Word {
}
}
impl Word {
impl Word<'_> {
fn matching_end_quote(ch: char) -> char {
match ch {
'"' => '"', // ANSI and most dialects
@ -613,15 +641,18 @@ impl Word {
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
pub enum Whitespace<'a> {
Space,
Newline,
Tab,
SingleLineComment { comment: String, prefix: String },
MultiLineComment(String),
SingleLineComment {
comment: Cow<'a, str>,
prefix: Cow<'a, str>,
},
MultiLineComment(Cow<'a, str>),
}
impl fmt::Display for Whitespace {
impl fmt::Display for Whitespace<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Whitespace::Space => f.write_str(" "),
@ -1016,7 +1047,7 @@ impl<'a> Tokenizer<'a> {
/// assert_eq!(tokens, vec![
/// Token::make_word("SELECT", None),
/// Token::Whitespace(Whitespace::Space),
/// Token::SingleQuotedString("foo".to_string()),
/// Token::SingleQuotedString("foo".to_string().into()),
/// ]);
pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
Self {
@ -1117,15 +1148,18 @@ impl<'a> Tokenizer<'a> {
&self,
consumed_byte_len: usize,
chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
chars.next(); // consume the first char
let word = self.tokenize_word(consumed_byte_len, chars)?;
// Calculate where the first character started
let first_char_byte_pos = chars.byte_pos.saturating_sub(consumed_byte_len);
let word = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
let mut inner_state = State {
peekable: word.chars().peekable(),
source: &word,
source: word,
line: 0,
col: 0,
byte_pos: 0,
@ -1136,7 +1170,7 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::Number(s, false)));
}
Ok(Some(Token::make_word(&word, None)))
Ok(Some(BorrowedToken::make_word_borrowed(word, None)))
}
/// Get the next token or return None
@ -1144,7 +1178,7 @@ impl<'a> Tokenizer<'a> {
&self,
chars: &mut State<'a>,
prev_token: Option<&BorrowedToken<'a>>,
) -> Result<Option<Token>, TokenizerError> {
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
match chars.peek() {
Some(&ch) => match ch {
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@ -1166,12 +1200,12 @@ impl<'a> Tokenizer<'a> {
Some('\'') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
BorrowedToken::SingleQuotedByteStringLiteral,
BorrowedToken::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
@ -1180,12 +1214,12 @@ impl<'a> Tokenizer<'a> {
Some('\"') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
BorrowedToken::DoubleQuotedByteStringLiteral,
BorrowedToken::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
@ -1193,8 +1227,9 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "b" or "B"
let s = self.tokenize_word(b.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(b.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
}
}
@ -1203,25 +1238,26 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
BorrowedToken::SingleQuotedRawStringLiteral,
BorrowedToken::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
BorrowedToken::DoubleQuotedRawStringLiteral,
BorrowedToken::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(b.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
}
}
@ -1239,8 +1275,9 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(n.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
}
}
@ -1256,8 +1293,9 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
}
}
@ -1275,8 +1313,9 @@ impl<'a> Tokenizer<'a> {
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
@ -1290,8 +1329,9 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8());
let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?;
Ok(Some(BorrowedToken::make_word_borrowed(s, None)))
}
}
}
@ -1299,21 +1339,21 @@ impl<'a> Tokenizer<'a> {
'\'' => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
|s| BorrowedToken::SingleQuotedString(Cow::Owned(s)),
BorrowedToken::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
let s = self.tokenize_single_quoted_string_borrowed(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::SingleQuotedString(s)))
Ok(Some(BorrowedToken::SingleQuotedString(s)))
}
// double quoted string
'\"' if !self.dialect.is_delimited_identifier_start(ch)
@ -1321,12 +1361,12 @@ impl<'a> Tokenizer<'a> {
{
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
.tokenize_single_or_triple_quoted_string::<fn(String) -> BorrowedToken<'a>>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
BorrowedToken::DoubleQuotedString,
BorrowedToken::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
@ -1536,11 +1576,11 @@ impl<'a> Tokenizer<'a> {
if is_comment {
chars.next(); // consume second '-'
let comment = self.tokenize_single_line_comment(chars)?;
return Ok(Some(Token::Whitespace(
let comment = self.tokenize_single_line_comment_borrowed(chars)?;
return Ok(Some(BorrowedToken::Whitespace(
Whitespace::SingleLineComment {
prefix: "--".to_owned(),
comment,
prefix: Cow::Borrowed("--"),
comment: Cow::Borrowed(comment),
},
)));
}
@ -1567,11 +1607,13 @@ impl<'a> Tokenizer<'a> {
}
Some('/') if dialect_of!(self is SnowflakeDialect) => {
chars.next(); // consume the second '/', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars)?;
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_owned(),
comment,
})))
let comment = self.tokenize_single_line_comment_borrowed(chars)?;
Ok(Some(BorrowedToken::Whitespace(
Whitespace::SingleLineComment {
prefix: Cow::Borrowed("//"),
comment: Cow::Borrowed(comment),
},
)))
}
Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
self.consume_and_return(chars, Token::DuckIntDiv)
@ -1773,11 +1815,13 @@ impl<'a> Tokenizer<'a> {
'#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
{
chars.next(); // consume the '#', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars)?;
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_owned(),
comment,
})))
let comment = self.tokenize_single_line_comment_borrowed(chars)?;
Ok(Some(BorrowedToken::Whitespace(
Whitespace::SingleLineComment {
prefix: Cow::Borrowed("#"),
comment: Cow::Borrowed(comment),
},
)))
}
'~' => {
chars.next(); // consume
@ -1923,10 +1967,10 @@ impl<'a> Tokenizer<'a> {
/// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
fn consume_for_binop(
&self,
chars: &mut State,
chars: &mut State<'a>,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
default: BorrowedToken<'a>,
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
chars.next(); // consume the first char
self.start_binop_opt(chars, prefix, Some(default))
}
@ -1934,20 +1978,20 @@ impl<'a> Tokenizer<'a> {
/// parse a custom binary operator
fn start_binop(
&self,
chars: &mut State,
chars: &mut State<'a>,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
default: BorrowedToken<'a>,
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
self.start_binop_opt(chars, prefix, Some(default))
}
/// parse a custom binary operator
fn start_binop_opt(
&self,
chars: &mut State,
chars: &mut State<'a>,
prefix: &str,
default: Option<Token>,
) -> Result<Option<Token>, TokenizerError> {
default: Option<BorrowedToken<'a>>,
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
let mut custom = None;
while let Some(&ch) = chars.peek() {
if !self.dialect.is_custom_operator_part(ch) {
@ -2132,16 +2176,6 @@ impl<'a> Tokenizer<'a> {
})
}
// Consume characters until newline
fn tokenize_single_line_comment(
&self,
chars: &mut State<'a>,
) -> Result<String, TokenizerError> {
Ok(self
.tokenize_single_line_comment_borrowed(chars)?
.to_string())
}
/// Tokenize a single-line comment, returning a borrowed slice.
/// Returns a slice that includes the terminating newline character.
fn tokenize_single_line_comment_borrowed(
@ -2167,29 +2201,6 @@ impl<'a> Tokenizer<'a> {
self.safe_slice(chars.source, start_pos, chars.byte_pos, error_loc)
}
/// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
/// `consumed_byte_len` is the byte length of the consumed character(s).
fn tokenize_word(
&self,
consumed_byte_len: usize,
chars: &mut State<'a>,
) -> Result<String, TokenizerError> {
let error_loc = chars.location();
// Overflow check: ensure we can safely subtract
if consumed_byte_len > chars.byte_pos {
return self.tokenizer_error(error_loc, "Invalid byte position in tokenize_word");
}
// Calculate where the first character started
let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
// Use the zero-copy version and convert to String
Ok(self
.tokenize_word_borrowed(first_char_byte_pos, chars)?
.to_string())
}
/// Tokenize an identifier or keyword, returning a borrowed slice when possible.
/// The first character position must be provided (before it was consumed).
/// Returns a slice with the same lifetime as the State's source.
@ -2245,14 +2256,14 @@ impl<'a> Tokenizer<'a> {
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
fn tokenize_single_or_triple_quoted_string<F>(
&self,
chars: &mut State,
chars: &mut State<'a>,
quote_style: char,
backslash_escape: bool,
single_quote_token: F,
triple_quote_token: F,
) -> Result<Option<Token>, TokenizerError>
) -> Result<Option<BorrowedToken<'a>>, TokenizerError>
where
F: Fn(String) -> Token,
F: Fn(String) -> BorrowedToken<'a>,
{
let error_loc = chars.location();
@ -2316,6 +2327,79 @@ impl<'a> Tokenizer<'a> {
)
}
/// Reads a string literal quoted by a single quote character, returning Cow for zero-copy.
/// Returns Cow::Borrowed when the string has no escape sequences or doubled quotes,
/// Cow::Owned when processing is required.
///
/// Strategy: a first pass scans the literal looking for anything that would
/// force the content to be rewritten — a doubled closing quote (`''`) or,
/// when `backslash_escape` is set, a `\` escape. If nothing needs rewriting,
/// the content is returned as a borrowed slice of the source (quotes
/// excluded, zero allocation). Otherwise the tokenizer state is rewound to
/// the opening quote and the existing allocating parser
/// `tokenize_single_quoted_string` re-reads the literal.
fn tokenize_single_quoted_string_borrowed(
    &self,
    chars: &mut State<'a>,
    quote_style: char,
    backslash_escape: bool,
) -> Result<Cow<'a, str>, TokenizerError> {
    // Remember where the opening quote starts so we can rewind to it, and
    // capture the location *before* consuming anything — it doubles as the
    // error location and as the line/col snapshot used by the rewind below.
    let start_byte_pos = chars.byte_pos;
    let error_loc = chars.location();
    // Consume opening quote
    if chars.next() != Some(quote_style) {
        return self.tokenizer_error(error_loc, "Expected opening quote");
    }
    // Content begins immediately after the opening quote.
    let content_start = chars.byte_pos;
    let mut needs_processing = false;
    // Scan the string to detect if processing is needed
    loop {
        match chars.peek() {
            None => {
                // Ran off the end of input before the closing quote.
                return self.tokenizer_error(error_loc, "Unterminated string literal");
            }
            Some(&ch) if ch == quote_style => {
                // Found a quote - check if it's doubled or the end
                let quote_pos = chars.byte_pos;
                chars.next(); // consume quote
                if chars.peek() == Some(&quote_style) {
                    // Doubled quote - an escaped quote in SQL ('' -> '),
                    // so the content cannot be returned verbatim.
                    needs_processing = true;
                    chars.next(); // consume second quote
                } else {
                    // End of string
                    if needs_processing {
                        // Reset and use the owned version: rewind the State
                        // to the opening quote and let the allocating parser
                        // handle escape/doubled-quote rewriting.
                        // NOTE(review): this assumes line/col only advance via
                        // next(), so restoring the snapshot taken at error_loc
                        // puts the State back exactly at the opening quote.
                        chars.byte_pos = start_byte_pos;
                        chars.line = error_loc.line;
                        chars.col = error_loc.column;
                        // Recreate peekable from current position
                        let remaining = &chars.source[start_byte_pos..];
                        chars.peekable = remaining.chars().peekable();
                        let s = self.tokenize_single_quoted_string(
                            chars,
                            quote_style,
                            backslash_escape,
                        )?;
                        return Ok(Cow::Owned(s));
                    } else {
                        // Can use borrowed slice (excluding quotes): content
                        // runs from just after the opening quote up to (not
                        // including) the closing quote.
                        return Ok(Cow::Borrowed(&chars.source[content_start..quote_pos]));
                    }
                }
            }
            Some(&'\\') if backslash_escape => {
                // Escape sequence - needs processing by the owned path.
                needs_processing = true;
                chars.next(); // consume backslash
                // Consume the escaped character too, so an escaped quote
                // (\') is not mistaken for the closing quote.
                if chars.next().is_none() {
                    return self.tokenizer_error(error_loc, "Unterminated string literal");
                }
            }
            Some(_) => {
                chars.next(); // consume regular character
            }
        }
    }
}
/// Read a quoted string.
fn tokenize_quoted_string(
&self,
@ -2426,11 +2510,11 @@ impl<'a> Tokenizer<'a> {
fn tokenize_multiline_comment(
&self,
chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
let s = self.tokenize_multiline_comment_borrowed(chars)?;
Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(
s.to_string(),
))))
Ok(Some(BorrowedToken::Whitespace(
Whitespace::MultiLineComment(Cow::Borrowed(s)),
)))
}
/// Tokenize a multi-line comment, returning a borrowed slice.
@ -2541,9 +2625,9 @@ impl<'a> Tokenizer<'a> {
#[allow(clippy::unnecessary_wraps)]
fn consume_and_return(
&self,
chars: &mut State,
t: Token,
) -> Result<Option<Token>, TokenizerError> {
chars: &mut State<'a>,
t: BorrowedToken<'a>,
) -> Result<Option<BorrowedToken<'a>>, TokenizerError> {
chars.next();
Ok(Some(t))
}
@ -3062,12 +3146,12 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Word(Word {
value: "foo".to_string(),
value: "foo".to_string().into(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),
Token::DoubleEq,
Token::SingleQuotedString("1".to_string()),
Token::SingleQuotedString("1".to_string().into()),
];
compare(expected, tokens);
@ -3169,11 +3253,11 @@ mod tests {
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("a")),
Token::SingleQuotedString(String::from("a").into()),
Token::Whitespace(Whitespace::Space),
Token::StringConcat,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("b")),
Token::SingleQuotedString(String::from("b").into()),
];
compare(expected, tokens);
@ -3352,7 +3436,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("Not Provided")),
Token::SingleQuotedString(String::from("Not Provided").into()),
];
compare(expected, tokens);
@ -3379,7 +3463,9 @@ mod tests {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
let expected = vec![Token::SingleQuotedString(
"foo\r\nbar\nbaz".to_string().into(),
)];
compare(expected, tokens);
}
@ -3669,8 +3755,8 @@ mod tests {
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\n".to_string(),
prefix: "--".to_string().into(),
comment: "this is a comment\n".to_string().into(),
}),
Token::Number("1".to_string(), false),
],
@ -3680,8 +3766,8 @@ mod tests {
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\r1".to_string(),
prefix: "--".to_string().into(),
comment: "this is a comment\r1".to_string().into(),
}),
],
),
@ -3690,8 +3776,8 @@ mod tests {
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\r\n".to_string(),
prefix: "--".to_string().into(),
comment: "this is a comment\r\n".to_string().into(),
}),
Token::Number("1".to_string(), false),
],
@ -3715,8 +3801,8 @@ mod tests {
let expected = vec![
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "\r".to_string(),
prefix: "--".to_string().into(),
comment: "\r".to_string().into(),
}),
Token::Number("0".to_string(), false),
];
@ -3730,8 +3816,8 @@ mod tests {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment".to_string(),
prefix: "--".to_string().into(),
comment: "this is a comment".to_string().into(),
})];
compare(expected, tokens);
}
@ -3745,7 +3831,7 @@ mod tests {
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
"multi-line\n* /comment".to_string().into(),
)),
Token::Number("1".to_string(), false),
];
@ -3764,7 +3850,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Div,
Token::Word(Word {
value: "comment".to_string(),
value: "comment".to_string().into(),
quote_style: None,
keyword: Keyword::COMMENT,
}),
@ -3791,7 +3877,9 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
Token::Whitespace(Whitespace::MultiLineComment(
" a /* b */ c ".to_string().into(),
)),
Token::Number("0".to_string(), false),
],
);
@ -3805,7 +3893,7 @@ mod tests {
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string().into())),
Token::Number("0".to_string(), false),
],
);
@ -3820,7 +3908,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"/* nested comment ".to_string(),
"/* nested comment ".to_string().into(),
)),
Token::Mul,
Token::Div,
@ -3837,7 +3925,9 @@ mod tests {
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
Token::Whitespace(Whitespace::MultiLineComment(
"* Comment *".to_string().into(),
)),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
@ -4221,14 +4311,16 @@ mod tests {
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
let expected = vec![Token::SingleQuotedString(expected.to_string().into())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
let expected = vec![Token::SingleQuotedString(
expected_unescaped.to_string().into(),
)];
compare(expected, tokens);
}
@ -4245,7 +4337,7 @@ mod tests {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
let expected = vec![Token::SingleQuotedString(expected.to_string().into())];
compare(expected, tokens);
}
@ -4255,7 +4347,7 @@ mod tests {
let dialect = MySqlDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
let expected = vec![Token::SingleQuotedString(expected.to_string().into())];
compare(expected, tokens);
}
@ -4358,7 +4450,7 @@ mod tests {
.unwrap();
let expected = vec![
Token::DoubleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string().into()),
];
compare(expected, tokens);
@ -4368,7 +4460,7 @@ mod tests {
.tokenize()
.unwrap();
let expected = vec![
Token::SingleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string().into()),
Token::DoubleQuotedString("".to_string()),
];
compare(expected, tokens);
@ -4377,7 +4469,7 @@ mod tests {
let dialect = SnowflakeDialect {};
let sql = r#"''''''"#;
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("''".to_string())];
let expected = vec![Token::SingleQuotedString("''".to_string().into())];
compare(expected, tokens);
}
@ -4409,7 +4501,7 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::AtSign,
Token::SingleQuotedString("1".to_string()),
Token::SingleQuotedString("1".to_string().into()),
];
compare(expected, tokens);
}
@ -4467,7 +4559,7 @@ mod tests {
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::make_word("e", None),
Token::SingleQuotedString("...".to_string()),
Token::SingleQuotedString("...".to_string().into()),
],
);
@ -4477,7 +4569,7 @@ mod tests {
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::make_word("E", None),
Token::SingleQuotedString("...".to_string()),
Token::SingleQuotedString("...".to_string().into()),
],
);
}
@ -4513,7 +4605,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Minus,
Token::Minus,
Token::SingleQuotedString("abc".to_string()),
Token::SingleQuotedString("abc".to_string().into()),
],
);
@ -4524,8 +4616,8 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: " 'abc'".to_string(),
prefix: "--".to_string().into(),
comment: " 'abc'".to_string().into(),
}),
],
);
@ -4551,8 +4643,8 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "'abc'".to_string(),
prefix: "--".to_string().into(),
comment: "'abc'".to_string().into(),
}),
],
);
@ -4564,8 +4656,8 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: " 'abc'".to_string(),
prefix: "--".to_string().into(),
comment: " 'abc'".to_string().into(),
}),
],
);
@ -4577,8 +4669,8 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "".to_string(),
prefix: "--".to_string().into(),
comment: "".to_string().into(),
}),
],
);
@ -4622,13 +4714,13 @@ mod tests {
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Word(Word {
value: "table".to_string(),
value: "table".to_string().into(),
quote_style: None,
keyword: Keyword::TABLE,
}),
Token::Period,
Token::Word(Word {
value: "_col".to_string(),
value: "_col".to_string().into(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),

View file

@ -2629,7 +2629,7 @@ fn test_export_data() {
body: Box::new(SetExpr::Select(Box::new(Select {
select_token: AttachedToken(TokenWithSpan::new(
Token::Word(Word {
value: "SELECT".to_string(),
value: "SELECT".to_string().into(),
quote_style: None,
keyword: Keyword::SELECT,
}),
@ -2733,7 +2733,7 @@ fn test_export_data() {
body: Box::new(SetExpr::Select(Box::new(Select {
select_token: AttachedToken(TokenWithSpan::new(
Token::Word(Word {
value: "SELECT".to_string(),
value: "SELECT".to_string().into(),
quote_style: None,
keyword: Keyword::SELECT,
}),

View file

@ -1581,7 +1581,7 @@ fn test_mssql_while_statement() {
while_block: ConditionalStatementBlock {
start_token: AttachedToken(TokenWithSpan {
token: Token::Word(Word {
value: "WHILE".to_string(),
value: "WHILE".to_string().into(),
quote_style: None,
keyword: Keyword::WHILE
}),

View file

@ -566,8 +566,8 @@ fn test_snowflake_single_line_tokenize() {
Token::Whitespace(Whitespace::Space),
Token::make_keyword("TABLE"),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_string(),
comment: " this is a comment \n".to_string(),
prefix: "#".to_string().into(),
comment: " this is a comment \n".to_string().into(),
}),
Token::make_word("table_1", None),
];
@ -583,8 +583,8 @@ fn test_snowflake_single_line_tokenize() {
Token::make_keyword("TABLE"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_string(),
comment: " this is a comment \n".to_string(),
prefix: "//".to_string().into(),
comment: " this is a comment \n".to_string().into(),
}),
Token::make_word("table_1", None),
];