diff --git a/.gitignore b/.gitignore index f705d0b0..c824499d 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock *.swp -.DS_store \ No newline at end of file +.DS_store + +# dhat profiler output files +dhat*.json \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ed94bbbd..76584757 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ visitor = ["sqlparser_derive"] bigdecimal = { version = "0.4.1", features = ["serde"], optional = true } log = "0.4" recursive = { version = "0.1.1", optional = true} +unicase = "2.7" serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } # serde_json is only used in examples/cli, but we have to put it outside @@ -60,7 +61,12 @@ sqlparser_derive = { version = "0.4.0", path = "derive", optional = true } simple_logger = "5.0" matches = "0.1" pretty_assertions = "1" +sysinfo = "0.30" +dhat = "0.3.3" +criterion = "0.5" [package.metadata.docs.rs] # Document these features on docs.rs -features = ["serde", "visitor"] +features = ["serde", "visitor"] + + diff --git a/sqlparser_bench/Cargo.toml b/sqlparser_bench/Cargo.toml index 4fb9af16..5ab7d41a 100644 --- a/sqlparser_bench/Cargo.toml +++ b/sqlparser_bench/Cargo.toml @@ -31,3 +31,7 @@ criterion = "0.7" [[bench]] name = "sqlparser_bench" harness = false + +[[bench]] +name = "tokenize_bench" +harness = false diff --git a/sqlparser_bench/benches/tokenize_bench.rs b/sqlparser_bench/benches/tokenize_bench.rs new file mode 100644 index 00000000..19a1e04e --- /dev/null +++ b/sqlparser_bench/benches/tokenize_bench.rs @@ -0,0 +1,862 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark tokenization performance +//! +//! This benchmark measures tokenization speed using a complex SQL query +//! with many identifiers, keywords, string literals, and comments. + +use criterion::{criterion_group, criterion_main, Criterion}; +use sqlparser::dialect::GenericDialect; +use sqlparser::tokenizer::Tokenizer; + +const COMPLEX_SQL: &str = r#" + -- ============================================================================ + -- Enterprise Sales Analytics Dashboard Query + -- ============================================================================ + -- Purpose: Comprehensive sales analysis across multiple dimensions + -- Author: Analytics Team + -- Last Modified: 2024-01-15 + -- ============================================================================ + + /* + * This query aggregates sales data from multiple sources: + * - Customer transactions and lifetime value + * - Product performance across categories + * - Regional sales trends and patterns + * - Employee commission calculations + * - Inventory and fulfillment metrics + */ + + WITH customer_segments AS ( + -- Segment customers by purchase behavior and demographics + SELECT + customer_id, + customer_number, + customer_name, + customer_type, + customer_status, + customer_tier, + email_address, + phone_number, + mobile_number, + fax_number, + date_of_birth, + registration_date, + last_login_date, + 
account_status, + email_verified, + phone_verified, + -- Address information + billing_address_line1, + billing_address_line2, + billing_city, + billing_state, + billing_postal_code, + billing_country, + shipping_address_line1, + shipping_address_line2, + shipping_city, + shipping_state, + shipping_postal_code, + shipping_country, + -- Demographics + gender, + age_group, + income_bracket, + education_level, + occupation, + marital_status, + -- Marketing preferences + marketing_opt_in, + sms_opt_in, + email_frequency, + preferred_channel, + preferred_language, + -- Calculated fields + CASE + WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '30' DAY THEN 'highly_active' + WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '90' DAY THEN 'active' + WHEN customer_status = 'active' THEN 'inactive' + ELSE 'dormant' + END AS activity_level, + CASE + WHEN registration_date >= CURRENT_DATE - INTERVAL '1' YEAR THEN 'new' + WHEN registration_date >= CURRENT_DATE - INTERVAL '3' YEAR THEN 'established' + ELSE 'veteran' + END AS customer_tenure + FROM customers + WHERE customer_status IN ('active', 'pending', 'suspended') + AND registration_date >= '2020-01-01' + AND billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy') + AND email_address NOT LIKE '%@test.com' + AND email_address NOT LIKE '%@example.com' + AND customer_name IS NOT NULL + ), + + product_catalog AS ( + -- Product information with categories and attributes + SELECT + product_id, + product_sku, + product_name, + product_description, + product_category, + product_subcategory, + product_brand, + product_manufacturer, + product_supplier, + product_model, + product_series, + product_version, + -- Pricing + list_price, + cost_price, + sale_price, + wholesale_price, + minimum_price, + suggested_retail_price, + -- Attributes + product_color, + product_size, + product_weight, + product_length, + product_width, + product_height, + 
product_material, + product_warranty, + -- Inventory + stock_quantity, + reorder_level, + reorder_quantity, + warehouse_location, + bin_location, + aisle_number, + shelf_number, + -- Status + product_status, + availability_status, + is_featured, + is_new_arrival, + is_on_sale, + is_clearance, + is_discontinued, + launch_date, + discontinuation_date, + -- Ratings + average_rating, + review_count, + return_rate, + defect_rate, + -- Categories + CASE + WHEN product_category = 'electronics' THEN 'high_tech' + WHEN product_category IN ('clothing', 'shoes', 'accessories') THEN 'fashion' + WHEN product_category IN ('home', 'garden', 'furniture') THEN 'home_living' + WHEN product_category IN ('sports', 'outdoor', 'fitness') THEN 'active_lifestyle' + ELSE 'general_merchandise' + END AS category_group + FROM products + WHERE product_status = 'active' + AND availability_status IN ('in_stock', 'low_stock', 'backorder') + AND is_discontinued = FALSE + AND launch_date <= CURRENT_DATE + ), + + order_transactions AS ( + -- Order and transaction details + SELECT + order_id, + order_number, + order_date, + order_time, + order_timestamp, + customer_id, + order_status, + order_type, + order_channel, + order_source, + -- Payment information + payment_method, + payment_status, + payment_date, + payment_reference, + transaction_id, + authorization_code, + -- Financial details + subtotal_amount, + tax_amount, + shipping_amount, + discount_amount, + coupon_amount, + gift_card_amount, + total_amount, + paid_amount, + refund_amount, + net_amount, + -- Shipping details + shipping_method, + shipping_carrier, + tracking_number, + shipped_date, + estimated_delivery_date, + actual_delivery_date, + delivery_status, + signature_required, + -- Location + ship_to_address_line1, + ship_to_address_line2, + ship_to_city, + ship_to_state, + ship_to_postal_code, + ship_to_country, + -- Fulfillment + warehouse_id, + fulfillment_center, + picker_id, + packer_id, + shipper_id, + -- Timestamps + created_at, + 
updated_at, + completed_at, + cancelled_at, + -- Flags + is_gift, + is_rush_order, + is_international, + requires_signature, + is_business_order, + -- Notes + customer_notes, + internal_notes, + gift_message, + special_instructions + FROM orders + WHERE order_date >= '2023-01-01' + AND order_date < '2024-12-31' + AND order_status IN ('pending', 'processing', 'shipped', 'delivered', 'completed') + AND order_type IN ('standard', 'express', 'overnight', 'international') + AND total_amount > 0 + AND customer_id IS NOT NULL + ), + + order_line_items AS ( + -- Individual line items from orders + SELECT + line_item_id, + order_id, + product_id, + line_number, + -- Quantities + quantity_ordered, + quantity_shipped, + quantity_cancelled, + quantity_returned, + quantity_damaged, + -- Pricing + unit_price, + unit_cost, + unit_discount, + line_subtotal, + line_tax, + line_shipping, + line_total, + -- Discounts + discount_type, + discount_code, + discount_percentage, + discount_reason, + -- Product details at time of order + product_sku_snapshot, + product_name_snapshot, + product_category_snapshot, + -- Status + line_status, + fulfillment_status, + return_status, + -- Warehouse + picked_from_warehouse, + picked_from_location, + picked_by_user, + picked_at_timestamp, + packed_by_user, + packed_at_timestamp, + -- Returns + return_reason, + return_date, + refund_amount, + restocking_fee, + -- Gift wrap + is_gift_wrapped, + gift_wrap_type, + gift_wrap_charge, + -- Calculated fields + unit_price * quantity_ordered AS line_revenue, + unit_cost * quantity_ordered AS line_cost, + (unit_price - unit_cost) * quantity_ordered AS line_profit, + CASE + WHEN quantity_returned > 0 THEN 'returned' + WHEN quantity_cancelled > 0 THEN 'cancelled' + WHEN quantity_shipped = quantity_ordered THEN 'fulfilled' + ELSE 'partial' + END AS fulfillment_type + FROM order_items + WHERE line_status NOT IN ('cancelled', 'voided') + AND quantity_ordered > 0 + ), + + employee_data AS ( + -- Employee and sales 
representative information + SELECT + employee_id, + employee_number, + employee_name, + first_name, + last_name, + middle_name, + email_address, + phone_extension, + mobile_phone, + -- Employment details + hire_date, + termination_date, + employment_status, + employment_type, + job_title, + job_level, + job_grade, + department_id, + department_name, + division_id, + division_name, + -- Management + manager_id, + manager_name, + reports_to, + -- Location + office_location, + office_building, + office_floor, + office_room, + work_city, + work_state, + work_country, + -- Compensation + base_salary, + commission_rate, + bonus_target, + commission_tier, + -- Performance + sales_quota, + current_sales, + quota_attainment, + performance_rating, + last_review_date, + next_review_date + FROM employees + WHERE employment_status = 'active' + AND employee_id IS NOT NULL + AND hire_date <= CURRENT_DATE + ), + + customer_lifetime_metrics AS ( + -- Calculate customer lifetime value and metrics + SELECT + cs.customer_id, + cs.customer_name, + cs.customer_tier, + cs.activity_level, + -- Order counts + COUNT(DISTINCT ot.order_id) AS total_orders, + COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.order_id END) AS orders_last_30_days, + COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.order_id END) AS orders_last_90_days, + COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.order_id END) AS orders_last_year, + -- Revenue metrics + SUM(ot.total_amount) AS lifetime_revenue, + SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_30_days, + SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_90_days, + SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_year, + -- Average values + AVG(ot.total_amount) AS 
average_order_value, + AVG(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount END) AS avg_order_value_last_year, + -- Product metrics + COUNT(DISTINCT oli.product_id) AS unique_products_purchased, + SUM(oli.quantity_ordered) AS total_items_purchased, + -- Return metrics + SUM(oli.quantity_returned) AS total_items_returned, + SUM(CASE WHEN oli.quantity_returned > 0 THEN oli.refund_amount ELSE 0 END) AS total_refund_amount, + -- Date ranges + MIN(ot.order_date) AS first_order_date, + MAX(ot.order_date) AS last_order_date, + MAX(ot.order_date) - MIN(ot.order_date) AS customer_lifespan_days, + -- Recency + CURRENT_DATE - MAX(ot.order_date) AS days_since_last_order + FROM customer_segments cs + LEFT JOIN order_transactions ot ON cs.customer_id = ot.customer_id + LEFT JOIN order_line_items oli ON ot.order_id = oli.order_id + WHERE ot.order_status IN ('delivered', 'completed') + GROUP BY + cs.customer_id, + cs.customer_name, + cs.customer_tier, + cs.activity_level + ), + + product_performance AS ( + -- Product sales performance metrics + SELECT + pc.product_id, + pc.product_sku, + pc.product_name, + pc.product_category, + pc.product_subcategory, + pc.product_brand, + pc.category_group, + -- Sales metrics + COUNT(DISTINCT oli.order_id) AS total_orders, + SUM(oli.quantity_ordered) AS total_quantity_sold, + SUM(oli.quantity_returned) AS total_quantity_returned, + SUM(oli.line_revenue) AS total_revenue, + SUM(oli.line_cost) AS total_cost, + SUM(oli.line_profit) AS total_profit, + -- Averages + AVG(oli.unit_price) AS average_selling_price, + AVG(oli.line_revenue) AS average_line_revenue, + -- Return rate + CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate, + -- Profit margin + CAST(SUM(oli.line_profit) AS DECIMAL) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin, + -- Rankings + RANK() OVER (PARTITION BY pc.product_category ORDER BY SUM(oli.line_revenue) DESC) AS revenue_rank_in_category, + 
RANK() OVER (ORDER BY SUM(oli.quantity_ordered) DESC) AS quantity_rank_overall + FROM product_catalog pc + INNER JOIN order_line_items oli ON pc.product_id = oli.product_id + INNER JOIN order_transactions ot ON oli.order_id = ot.order_id + WHERE ot.order_status IN ('delivered', 'completed') + AND ot.order_date >= '2023-01-01' + GROUP BY + pc.product_id, + pc.product_sku, + pc.product_name, + pc.product_category, + pc.product_subcategory, + pc.product_brand, + pc.category_group + ), + + regional_sales AS ( + -- Sales performance by region + SELECT + cs.billing_country, + cs.billing_state, + cs.billing_city, + -- Order metrics + COUNT(DISTINCT ot.order_id) AS total_orders, + COUNT(DISTINCT cs.customer_id) AS unique_customers, + -- Revenue + SUM(ot.total_amount) AS total_revenue, + SUM(ot.shipping_amount) AS total_shipping_revenue, + SUM(ot.tax_amount) AS total_tax_collected, + AVG(ot.total_amount) AS average_order_value, + -- Time periods + SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2024, + SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2023, + -- Growth + (SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) - + SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END)) / + NULLIF(SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END), 0) AS year_over_year_growth + FROM customer_segments cs + INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id + WHERE ot.order_status IN ('delivered', 'completed') + GROUP BY + cs.billing_country, + cs.billing_state, + cs.billing_city + HAVING SUM(ot.total_amount) > 1000 + ), + + monthly_trends AS ( + -- Monthly sales trends and seasonality + SELECT + DATE_TRUNC('month', ot.order_date) AS order_month, + EXTRACT(YEAR FROM ot.order_date) AS order_year, + EXTRACT(MONTH FROM 
ot.order_date) AS month_number, + EXTRACT(QUARTER FROM ot.order_date) AS quarter_number, + -- Volume metrics + COUNT(DISTINCT ot.order_id) AS orders, + COUNT(DISTINCT ot.customer_id) AS customers, + SUM(oli.quantity_ordered) AS items_sold, + -- Financial metrics + SUM(ot.subtotal_amount) AS subtotal, + SUM(ot.tax_amount) AS tax, + SUM(ot.shipping_amount) AS shipping, + SUM(ot.discount_amount) AS discounts, + SUM(ot.total_amount) AS revenue, + -- Averages + AVG(ot.total_amount) AS avg_order_value, + AVG(oli.quantity_ordered) AS avg_items_per_order, + -- Moving averages + AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS three_month_moving_avg, + AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS six_month_moving_avg + FROM order_transactions ot + INNER JOIN order_line_items oli ON ot.order_id = oli.order_id + WHERE ot.order_status IN ('delivered', 'completed') + AND ot.order_date >= '2022-01-01' + GROUP BY + DATE_TRUNC('month', ot.order_date), + EXTRACT(YEAR FROM ot.order_date), + EXTRACT(MONTH FROM ot.order_date), + EXTRACT(QUARTER FROM ot.order_date) + ), + + category_analysis AS ( + -- Category performance analysis + SELECT + pc.product_category, + pc.product_subcategory, + pc.category_group, + -- Sales + COUNT(DISTINCT oli.order_id) AS orders, + SUM(oli.quantity_ordered) AS quantity, + SUM(oli.line_revenue) AS revenue, + SUM(oli.line_profit) AS profit, + -- Market share + SUM(oli.line_revenue) / SUM(SUM(oli.line_revenue)) OVER () AS revenue_share, + -- Pricing + AVG(oli.unit_price) AS avg_price, + MIN(oli.unit_price) AS min_price, + MAX(oli.unit_price) AS max_price, + -- Profitability + SUM(oli.line_profit) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin, + -- Returns + SUM(oli.quantity_returned) AS returns, + CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate + FROM 
product_catalog pc + INNER JOIN order_line_items oli ON pc.product_id = oli.product_id + INNER JOIN order_transactions ot ON oli.order_id = ot.order_id + WHERE ot.order_status IN ('delivered', 'completed') + GROUP BY + pc.product_category, + pc.product_subcategory, + pc.category_group + ) + + -- Main query combining all CTEs + SELECT + -- Customer information + cs.customer_id, + cs.customer_number, + cs.customer_name, + cs.customer_type, + cs.customer_tier, + cs.activity_level, + cs.customer_tenure, + cs.email_address, + cs.phone_number, + cs.billing_city, + cs.billing_state, + cs.billing_country, + cs.age_group, + cs.gender, + cs.income_bracket, + -- Customer metrics + clm.total_orders, + clm.orders_last_30_days, + clm.orders_last_90_days, + clm.orders_last_year, + clm.lifetime_revenue, + clm.revenue_last_30_days, + clm.revenue_last_90_days, + clm.revenue_last_year, + clm.average_order_value, + clm.unique_products_purchased, + clm.total_items_purchased, + clm.total_items_returned, + clm.first_order_date, + clm.last_order_date, + clm.days_since_last_order, + -- Order details + ot.order_id, + ot.order_number, + ot.order_date, + ot.order_status, + ot.order_type, + ot.order_channel, + ot.payment_method, + ot.payment_status, + ot.subtotal_amount, + ot.tax_amount, + ot.shipping_amount, + ot.discount_amount, + ot.total_amount, + ot.shipping_method, + ot.shipping_carrier, + ot.tracking_number, + ot.delivery_status, + -- Line item details + oli.line_item_id, + oli.product_id, + oli.quantity_ordered, + oli.quantity_shipped, + oli.unit_price, + oli.line_total, + oli.discount_type, + oli.line_status, + -- Product information + pc.product_sku, + pc.product_name, + pc.product_category, + pc.product_subcategory, + pc.product_brand, + pc.product_manufacturer, + pc.category_group, + pc.list_price, + pc.product_color, + pc.product_size, + pc.average_rating, + pc.review_count, + -- Product performance + pp.total_quantity_sold AS product_total_quantity_sold, + pp.total_revenue AS 
product_total_revenue, + pp.total_profit AS product_total_profit, + pp.return_rate AS product_return_rate, + pp.profit_margin AS product_profit_margin, + pp.revenue_rank_in_category, + -- Employee information + ed.employee_id, + ed.employee_name, + ed.job_title, + ed.department_name, + ed.office_location, + ed.commission_rate, + ed.sales_quota, + -- Regional metrics + rs.total_orders AS region_total_orders, + rs.unique_customers AS region_unique_customers, + rs.total_revenue AS region_total_revenue, + rs.average_order_value AS region_avg_order_value, + rs.year_over_year_growth AS region_yoy_growth, + -- Category metrics + ca.revenue AS category_revenue, + ca.profit AS category_profit, + ca.revenue_share AS category_revenue_share, + ca.profit_margin AS category_profit_margin, + ca.return_rate AS category_return_rate, + -- Monthly trends + mt.order_month, + mt.three_month_moving_avg, + mt.six_month_moving_avg, + -- Calculated fields + CASE + WHEN clm.lifetime_revenue > 10000 THEN 'vip' + WHEN clm.lifetime_revenue > 5000 THEN 'premium' + WHEN clm.lifetime_revenue > 1000 THEN 'standard' + ELSE 'basic' + END AS calculated_tier, + CASE + WHEN clm.days_since_last_order <= 30 THEN 'very_recent' + WHEN clm.days_since_last_order <= 90 THEN 'recent' + WHEN clm.days_since_last_order <= 180 THEN 'moderate' + ELSE 'at_risk' + END AS recency_segment, + CASE + WHEN clm.total_orders >= 50 THEN 'frequent' + WHEN clm.total_orders >= 20 THEN 'regular' + WHEN clm.total_orders >= 5 THEN 'occasional' + ELSE 'rare' + END AS frequency_segment, + oli.unit_price * oli.quantity_ordered AS calculated_line_revenue, + (oli.unit_price * oli.quantity_ordered) * (ed.commission_rate / 100) AS calculated_commission, + ROUND(oli.unit_price * oli.quantity_ordered * 0.9, 2) AS discounted_line_total, + -- Window functions + ROW_NUMBER() OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date DESC) AS order_recency_rank, + RANK() OVER (PARTITION BY cs.billing_country ORDER BY clm.lifetime_revenue DESC) 
AS customer_value_rank_in_country, + DENSE_RANK() OVER (PARTITION BY pc.product_category ORDER BY oli.quantity_ordered DESC) AS product_popularity_rank, + SUM(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_customer_revenue, + AVG(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS rolling_5_order_avg, + -- Aggregates + SUM(oli.quantity_ordered) OVER (PARTITION BY pc.product_category) AS category_total_quantity, + COUNT(DISTINCT ot.order_id) OVER (PARTITION BY cs.billing_country, DATE_TRUNC('month', ot.order_date)) AS monthly_orders_in_country, + MAX(ot.total_amount) OVER (PARTITION BY cs.customer_id) AS customer_largest_order, + MIN(ot.order_date) OVER (PARTITION BY pc.product_id) AS product_first_sale_date + + FROM customer_segments cs + INNER JOIN customer_lifetime_metrics clm ON cs.customer_id = clm.customer_id + INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id + INNER JOIN order_line_items oli ON ot.order_id = oli.order_id + INNER JOIN product_catalog pc ON oli.product_id = pc.product_id + INNER JOIN product_performance pp ON pc.product_id = pp.product_id + LEFT JOIN employee_data ed ON ot.order_id IN ( + SELECT order_id FROM employee_assignments WHERE employee_id = ed.employee_id + ) + LEFT JOIN regional_sales rs ON cs.billing_country = rs.billing_country + AND cs.billing_state = rs.billing_state + AND cs.billing_city = rs.billing_city + LEFT JOIN category_analysis ca ON pc.product_category = ca.product_category + AND pc.product_subcategory = ca.product_subcategory + LEFT JOIN monthly_trends mt ON DATE_TRUNC('month', ot.order_date) = mt.order_month + + WHERE + -- Date filters + ot.order_date >= '2023-01-01' + AND ot.order_date < '2024-12-31' + -- Status filters + AND ot.order_status IN ('processing', 'shipped', 'delivered', 'completed') + AND oli.line_status NOT IN ('cancelled', 
'voided', 'rejected') + AND cs.customer_status = 'active' + AND pc.product_status = 'active' + -- Geographic filters + AND cs.billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy', 'Japan', 'Australia') + AND cs.billing_state NOT IN ('test', 'demo', 'internal') + -- Category filters + AND pc.product_category IN ('electronics', 'clothing', 'home', 'sports', 'books', 'toys', 'automotive', 'health', 'beauty', 'grocery') + AND pc.product_subcategory NOT LIKE '%test%' + -- Amount filters + AND ot.total_amount > 0 + AND ot.total_amount < 100000 + AND oli.quantity_ordered > 0 + AND oli.unit_price > 0 + -- Quality filters + AND cs.email_address NOT LIKE '%@test.com' + AND cs.email_address NOT LIKE '%@example.com' + AND cs.email_address NOT LIKE '%@invalid.com' + AND cs.customer_name NOT LIKE '%test%' + AND cs.customer_name NOT LIKE '%demo%' + AND pc.product_name NOT LIKE '%sample%' + AND pc.product_name NOT LIKE '%demo%' + -- Tier filters + AND cs.customer_tier IN ('gold', 'silver', 'bronze', 'platinum') + AND cs.activity_level IN ('highly_active', 'active') + -- Payment filters + AND ot.payment_status = 'completed' + AND ot.payment_method IN ('credit_card', 'debit_card', 'paypal', 'apple_pay', 'google_pay', 'bank_transfer') + -- Shipping filters + AND ot.delivery_status IN ('delivered', 'in_transit', 'out_for_delivery') + AND ot.shipping_method IN ('standard', 'express', 'overnight', 'two_day') + -- Channel filters + AND ot.order_channel IN ('web', 'mobile', 'tablet', 'phone', 'store', 'marketplace') + -- Null checks + AND cs.customer_id IS NOT NULL + AND ot.order_id IS NOT NULL + AND oli.product_id IS NOT NULL + AND pc.product_sku IS NOT NULL + AND ot.total_amount IS NOT NULL + + GROUP BY + cs.customer_id, cs.customer_number, cs.customer_name, cs.customer_type, cs.customer_tier, + cs.activity_level, cs.customer_tenure, cs.email_address, cs.phone_number, + cs.billing_city, cs.billing_state, cs.billing_country, cs.age_group, cs.gender, 
cs.income_bracket, + clm.total_orders, clm.orders_last_30_days, clm.orders_last_90_days, clm.orders_last_year, + clm.lifetime_revenue, clm.revenue_last_30_days, clm.revenue_last_90_days, clm.revenue_last_year, + clm.average_order_value, clm.unique_products_purchased, clm.total_items_purchased, + clm.total_items_returned, clm.first_order_date, clm.last_order_date, clm.days_since_last_order, + ot.order_id, ot.order_number, ot.order_date, ot.order_status, ot.order_type, ot.order_channel, + ot.payment_method, ot.payment_status, ot.subtotal_amount, ot.tax_amount, ot.shipping_amount, + ot.discount_amount, ot.total_amount, ot.shipping_method, ot.shipping_carrier, ot.tracking_number, + ot.delivery_status, oli.line_item_id, oli.product_id, oli.quantity_ordered, oli.quantity_shipped, + oli.unit_price, oli.line_total, oli.discount_type, oli.line_status, + pc.product_sku, pc.product_name, pc.product_category, pc.product_subcategory, pc.product_brand, + pc.product_manufacturer, pc.category_group, pc.list_price, pc.product_color, pc.product_size, + pc.average_rating, pc.review_count, pp.total_quantity_sold, pp.total_revenue, pp.total_profit, + pp.return_rate, pp.profit_margin, pp.revenue_rank_in_category, + ed.employee_id, ed.employee_name, ed.job_title, ed.department_name, ed.office_location, + ed.commission_rate, ed.sales_quota, rs.total_orders, rs.unique_customers, rs.total_revenue, + rs.average_order_value, rs.year_over_year_growth, + ca.revenue, ca.profit, ca.revenue_share, ca.profit_margin, ca.return_rate, + mt.order_month, mt.three_month_moving_avg, mt.six_month_moving_avg + + HAVING + SUM(oli.quantity_ordered) > 0 + AND SUM(oli.line_total) > 0 + AND COUNT(DISTINCT ot.order_id) >= 1 + + ORDER BY + clm.lifetime_revenue DESC, + clm.total_orders DESC, + ot.order_date DESC, + cs.customer_name ASC, + pc.product_category ASC, + pc.product_name ASC, + oli.line_number ASC, + ot.order_id ASC + + LIMIT 100000 + OFFSET 0; + + -- Additional analytics queries for dashboard + + -- Top 
customers by revenue + SELECT + customer_id, + customer_name, + customer_tier, + total_orders, + lifetime_revenue, + average_order_value, + days_since_last_order + FROM customer_lifetime_metrics + WHERE lifetime_revenue > 1000 + ORDER BY lifetime_revenue DESC + LIMIT 100; + + -- Top products by sales + SELECT + product_sku, + product_name, + product_category, + product_brand, + total_quantity_sold, + total_revenue, + total_profit, + profit_margin, + return_rate + FROM product_performance + WHERE total_revenue > 5000 + ORDER BY total_revenue DESC + LIMIT 50; + + -- Regional performance summary + SELECT + billing_country, + billing_state, + total_orders, + unique_customers, + total_revenue, + average_order_value, + year_over_year_growth + FROM regional_sales + WHERE total_revenue > 10000 + ORDER BY total_revenue DESC; +"#; + +fn tokenization_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("tokenization"); + let dialect = GenericDialect {}; + + group.bench_function("tokenize_complex_sql", |b| { + b.iter(|| { + let mut tokenizer = Tokenizer::new(&dialect, COMPLEX_SQL); + tokenizer.tokenize().unwrap() + }); + }); + + group.finish(); +} + +criterion_group!(benches, tokenization_benchmark); +criterion_main!(benches); diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs index a696c101..3ea0fefb 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -1251,7 +1251,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result { continue_loop = false; let next_token = parser.next_token(); match next_token.token { - BorrowedToken::SingleQuotedString(s) => files.push(s), + BorrowedToken::SingleQuotedString(s) => files.push(s.into_owned()), _ => parser.expected("file token", next_token)?, }; if parser.next_token().token.eq(&BorrowedToken::Comma) { @@ -1266,7 +1266,7 @@ pub fn parse_copy_into(parser: &Parser) -> Result { parser.expect_token(&BorrowedToken::Eq)?; let next_token = parser.next_token(); pattern = Some(match next_token.token { - 
BorrowedToken::SingleQuotedString(s) => s,
+                BorrowedToken::SingleQuotedString(s) => s.into_owned(),
                 _ => parser.expected("pattern", next_token)?,
             });
         // VALIDATION MODE
@@ -1417,7 +1417,7 @@ fn parse_stage_params(parser: &Parser) -> Result
     if parser.parse_keyword(Keyword::URL) {
         parser.expect_token(&BorrowedToken::Eq)?;
         url = Some(match parser.next_token().token {
-            BorrowedToken::SingleQuotedString(word) => Ok(word),
+            BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
             _ => parser.expected("a URL statement", parser.peek_token()),
         }?)
     }
@@ -1432,7 +1432,7 @@ fn parse_stage_params(parser: &Parser) -> Result
     if parser.parse_keyword(Keyword::ENDPOINT) {
         parser.expect_token(&BorrowedToken::Eq)?;
         endpoint = Some(match parser.next_token().token {
-            BorrowedToken::SingleQuotedString(word) => Ok(word),
+            BorrowedToken::SingleQuotedString(word) => Ok(word.into_owned()),
             _ => parser.expected("an endpoint statement", parser.peek_token()),
         }?)
     }
@@ -1486,7 +1486,7 @@ fn parse_session_options(parser: &Parser, set: bool) -> Result
[NOTE(review): span garbled in transit — the remainder of this hunk plus the
diff header and added `use` lines for src/keywords/mod.rs were lost here; the
next visible fragment is the tail of the KEYWORD_MAP declaration, reconstructed
below from its surviving suffix and the code that populates it]
diff --git a/src/keywords/mod.rs b/src/keywords/mod.rs
+/// Static keyword map for O(1) case-insensitive lookups (std builds only)
+#[cfg(feature = "std")]
+static KEYWORD_MAP: OnceLock<HashMap<UniCase<&'static str>, Keyword>> = OnceLock::new();
+
+/// Get the HashMap of keywords, initializing it on first access
+#[cfg(feature = "std")]
+fn get_keyword_map() -> &'static HashMap<UniCase<&'static str>, Keyword> {
+    KEYWORD_MAP.get_or_init(|| {
+        let mut map = HashMap::with_capacity(ALL_KEYWORDS.len());
+        for (keyword_str, keyword_enum) in ALL_KEYWORDS.iter().zip(ALL_KEYWORDS_INDEX.iter()) {
+            map.insert(UniCase::ascii(*keyword_str), *keyword_enum);
+        }
+        map
+    })
+}
+
+/// Look up a keyword by string, case-insensitively, with O(1) complexity
+///
+/// # Arguments
+/// * `word` - The word to look up (case-insensitive)
+///
+/// # Returns
+/// * `Some(Keyword)` if the word is a keyword
+/// * `None` if the word is not a keyword
+///
+/// # Example
+/// ```
+/// use sqlparser::keywords::{get_keyword, Keyword};
+///
+/// assert_eq!(get_keyword("SELECT"), Some(Keyword::SELECT));
+/// assert_eq!(get_keyword("select"), Some(Keyword::SELECT));
+///
assert_eq!(get_keyword("my_table"), None);
+/// ```
+#[cfg(feature = "std")]
+pub fn get_keyword(word: &str) -> Option<Keyword> {
+    get_keyword_map().get(&UniCase::ascii(word)).copied()
+}
+
+/// Fallback for no_std: use binary search (same as before)
+#[cfg(not(feature = "std"))]
+pub fn get_keyword(word: &str) -> Option<Keyword> {
+    ALL_KEYWORDS
+        .binary_search_by(|k| unicase::UniCase::ascii(k).cmp(&unicase::UniCase::ascii(&word)))
+        .ok()
+        .map(|idx| ALL_KEYWORDS_INDEX[idx])
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 0d1285d6..8a14aa66 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -14,6 +14,7 @@
 #[cfg(not(feature = "std"))]
 use alloc::{
+    borrow::Cow,
     boxed::Box,
     format,
     string::{String, ToString},
@@ -25,6 +26,8 @@ use core::{
     str::FromStr,
 };
 use helpers::attached_token::AttachedToken;
+#[cfg(feature = "std")]
+use std::borrow::Cow;
 use log::debug;
@@ -1793,8 +1796,11 @@ impl<'a> Parser<'a> {
                     break;
                 }
                 BorrowedToken::SingleQuotedString(s) => {
-                    let expr =
-                        Expr::Identifier(Ident::with_quote_and_span('\'', next_token.span, s));
+                    let expr = Expr::Identifier(Ident::with_quote_and_span(
+                        '\'',
+                        next_token.span,
+                        s.as_ref(),
+                    ));
                     chain.push(AccessExpr::Dot(expr));
                     self.advance_token(); // The consumed string
                 }
@@ -3893,7 +3899,7 @@ impl<'a> Parser<'a> {
             // any keyword here unquoted.
keyword: _, }) => Ok(JsonPathElem::Dot { - key: value, + key: value.to_string(), quoted: quote_style.is_some(), }), @@ -7744,7 +7750,9 @@ impl<'a> Parser<'a> { if dialect_of!(self is HiveDialect) && self.parse_keyword(Keyword::COMMENT) { let next_token = self.next_token(); match next_token.token { - BorrowedToken::SingleQuotedString(str) => Some(CommentDef::WithoutEq(str)), + BorrowedToken::SingleQuotedString(str) => { + Some(CommentDef::WithoutEq(str.into_owned())) + } _ => self.expected("comment", next_token)?, } } else { @@ -7965,11 +7973,11 @@ impl<'a> Parser<'a> { let comment = match (has_eq, value.token) { (true, BorrowedToken::SingleQuotedString(s)) => { - Ok(Some(SqlOption::Comment(CommentDef::WithEq(s)))) - } - (false, BorrowedToken::SingleQuotedString(s)) => { - Ok(Some(SqlOption::Comment(CommentDef::WithoutEq(s)))) + Ok(Some(SqlOption::Comment(CommentDef::WithEq(s.into_owned())))) } + (false, BorrowedToken::SingleQuotedString(s)) => Ok(Some(SqlOption::Comment( + CommentDef::WithoutEq(s.into_owned()), + ))), (_, token) => self.expected( "BorrowedToken::SingleQuotedString", TokenWithSpan::wrap(token), @@ -8014,8 +8022,8 @@ impl<'a> Parser<'a> { let value = self.next_token(); let tablespace = match value.token { - BorrowedToken::Word(Word { value: name, .. }) - | BorrowedToken::SingleQuotedString(name) => { + BorrowedToken::Word(Word { value: name, .. 
}) => { + let name = name.to_string(); let storage = match self.parse_keyword(Keyword::STORAGE) { true => { let _ = self.consume_token(&BorrowedToken::Eq); @@ -8038,6 +8046,28 @@ impl<'a> Parser<'a> { storage, }))) } + BorrowedToken::SingleQuotedString(name) => { + let storage = match self.parse_keyword(Keyword::STORAGE) { + true => { + let _ = self.consume_token(&BorrowedToken::Eq); + let storage_token = self.next_token(); + match &storage_token.token { + BorrowedToken::Word(w) => match w.value.to_uppercase().as_str() { + "DISK" => Some(StorageType::Disk), + "MEMORY" => Some(StorageType::Memory), + _ => self.expected("DISK or MEMORY", storage_token)?, + }, + _ => self.expected("BorrowedToken::Word", storage_token)?, + } + } + false => None, + }; + + Ok(Some(SqlOption::TableSpace(TablespaceOption { + name: name.into_owned(), + storage, + }))) + } _ => { return self.expected("BorrowedToken::Word", value)?; } @@ -8176,7 +8206,7 @@ impl<'a> Parser<'a> { pub fn parse_comment_value(&self) -> Result { let next_token = self.next_token(); let value = match next_token.token { - BorrowedToken::SingleQuotedString(str) => str, + BorrowedToken::SingleQuotedString(str) => str.into_owned(), BorrowedToken::DollarQuotedString(str) => str.value, _ => self.expected("string literal", next_token)?, }; @@ -10381,8 +10411,8 @@ impl<'a> Parser<'a> { } Keyword::NULL => ok_value(Value::Null), Keyword::NoKeyword if w.quote_style.is_some() => match w.quote_style { - Some('"') => ok_value(Value::DoubleQuotedString(w.value)), - Some('\'') => ok_value(Value::SingleQuotedString(w.value)), + Some('"') => ok_value(Value::DoubleQuotedString(w.value.into_owned())), + Some('\'') => ok_value(Value::SingleQuotedString(w.value.into_owned())), _ => self.expected( "A value?", TokenWithSpan { @@ -10484,11 +10514,18 @@ impl<'a> Parser<'a> { fn maybe_concat_string_literal(&self, mut str: String) -> String { if self.dialect.supports_string_literal_concatenation() { - while let 
BorrowedToken::SingleQuotedString(ref s) - | BorrowedToken::DoubleQuotedString(ref s) = self.peek_token_ref().token - { - str.push_str(s.clone().as_str()); - self.advance_token(); + loop { + match &self.peek_token_ref().token { + BorrowedToken::SingleQuotedString(s) => { + str.push_str(s.as_ref()); + self.advance_token(); + } + BorrowedToken::DoubleQuotedString(s) => { + str.push_str(s); + self.advance_token(); + } + _ => break, + } } } str @@ -10584,8 +10621,8 @@ impl<'a> Parser<'a> { value, keyword: Keyword::NoKeyword, .. - }) => Ok(value), - BorrowedToken::SingleQuotedString(s) => Ok(s), + }) => Ok(value.into_owned()), + BorrowedToken::SingleQuotedString(s) => Ok(s.into_owned()), BorrowedToken::DoubleQuotedString(s) => Ok(s), BorrowedToken::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => { Ok(s) @@ -11100,7 +11137,7 @@ impl<'a> Parser<'a> { loop { let next_token = self.next_token(); match next_token.token { - BorrowedToken::SingleQuotedString(value) => values.push(value), + BorrowedToken::SingleQuotedString(value) => values.push(value.into_owned()), _ => self.expected("a string", next_token)?, } let next_token = self.next_token(); @@ -12125,7 +12162,7 @@ impl<'a> Parser<'a> { match next_token.token { BorrowedToken::Word(w) => modifiers.push(w.to_string()), BorrowedToken::Number(n, _) => modifiers.push(n), - BorrowedToken::SingleQuotedString(s) => modifiers.push(s), + BorrowedToken::SingleQuotedString(s) => modifiers.push(s.into_owned()), BorrowedToken::Comma => { continue; @@ -13261,7 +13298,7 @@ impl<'a> Parser<'a> { if token2 == BorrowedToken::Period { match token1.token { BorrowedToken::Word(w) => { - schema_name = w.value; + schema_name = w.value.to_string(); } _ => { return self.expected("Schema name", token1); @@ -13269,7 +13306,7 @@ impl<'a> Parser<'a> { } match token3.token { BorrowedToken::Word(w) => { - table_name = w.value; + table_name = w.value.to_string(); } _ => { return self.expected("Table name", token3); @@ 
-13282,7 +13319,7 @@ impl<'a> Parser<'a> { } else { match token1.token { BorrowedToken::Word(w) => { - table_name = w.value; + table_name = w.value.to_string(); } _ => { return self.expected("Table name", token1); @@ -14408,7 +14445,9 @@ impl<'a> Parser<'a> { None => { let next_token = self.next_token(); if let BorrowedToken::Word(w) = next_token.token { - Expr::Value(Value::Placeholder(w.value).with_span(next_token.span)) + Expr::Value( + Value::Placeholder(w.value.into_owned()).with_span(next_token.span), + ) } else { return parser_err!( "Expecting number or byte length e.g. 100M", @@ -14962,7 +15001,7 @@ impl<'a> Parser<'a> { let r#type = self.parse_data_type()?; let path = if let BorrowedToken::SingleQuotedString(path) = self.peek_token().token { self.next_token(); - Some(path) + Some(path.into_owned()) } else { None }; @@ -16491,7 +16530,7 @@ impl<'a> Parser<'a> { let opt_ilike = if self.parse_keyword(Keyword::ILIKE) { let next_token = self.next_token(); let pattern = match next_token.token { - BorrowedToken::SingleQuotedString(s) => s, + BorrowedToken::SingleQuotedString(s) => s.into_owned(), _ => return self.expected("ilike pattern", next_token), }; Some(IlikeSelectItem { pattern }) @@ -17128,7 +17167,11 @@ impl<'a> Parser<'a> { (true, _) => BorrowedToken::RParen, (false, BorrowedToken::EOF) => BorrowedToken::EOF, (false, BorrowedToken::Word(w)) if end_kws.contains(&w.keyword) => { - BorrowedToken::Word(w) + BorrowedToken::Word(Word { + value: Cow::Owned(w.value.into_owned()), + quote_style: w.quote_style, + keyword: w.keyword, + }) } (false, _) => BorrowedToken::SemiColon, }; @@ -18327,27 +18370,27 @@ impl<'a> Parser<'a> { self.expect_token(&BorrowedToken::Eq)?; match self.peek_token().token { BorrowedToken::SingleQuotedString(_) => Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::Single(self.parse_value()?.into()), }), BorrowedToken::Word(word) if word.keyword == Keyword::TRUE || 
word.keyword == Keyword::FALSE => { Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::Single(self.parse_value()?.into()), }) } BorrowedToken::Number(..) => Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::Single(self.parse_value()?.into()), }), BorrowedToken::Word(word) => { self.next_token(); Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::Single(Value::Placeholder( - word.value.clone(), + word.value.to_string(), )), }) } @@ -18365,12 +18408,12 @@ impl<'a> Parser<'a> { Some(values) => { let values = values.into_iter().map(|v| v.value).collect(); Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::Multi(values), }) } None => Ok(KeyValueOption { - option_name: key.value.clone(), + option_name: key.value.to_string(), option_value: KeyValueOptionKind::KeyValueOptions(Box::new( self.parse_key_value_options(true, &[])?, )), @@ -18405,11 +18448,11 @@ fn maybe_prefixed_expr(expr: Expr, prefix: Option) -> Expr { } } -impl Word { +impl Word<'_> { #[deprecated(since = "0.54.0", note = "please use `into_ident` instead")] pub fn to_ident(&self, span: Span) -> Ident { Ident { - value: self.value.clone(), + value: self.value.to_string(), quote_style: self.quote_style, span, } @@ -18418,7 +18461,7 @@ impl Word { /// Convert this word into an [`Ident`] identifier pub fn into_ident(self, span: Span) -> Ident { Ident { - value: self.value, + value: self.value.into_owned(), quote_style: self.quote_style, span, } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index fef4fe10..1f6d66c2 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -23,7 +23,7 @@ #[cfg(not(feature = "std"))] use alloc::{ - borrow::{Cow, ToOwned}, + borrow::Cow, format, string::{String, ToString}, vec, @@ -48,7 
+48,7 @@ use crate::dialect::{ BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect, SnowflakeDialect, }; -use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX}; +use crate::keywords::Keyword; use crate::{ast::DollarQuotedString, dialect::HiveDialect}; /// SQL Token enumeration with lifetime parameter for future zero-copy support @@ -59,13 +59,13 @@ pub enum BorrowedToken<'a> { /// An end-of-file marker, not a real token EOF, /// A keyword (like SELECT) or an optionally quoted SQL identifier - Word(Word), + Word(Word<'a>), /// An unsigned numeric literal Number(String, bool), /// A character that could not be tokenized Char(char), /// Single quoted string: i.e: 'string' - SingleQuotedString(String), + SingleQuotedString(Cow<'a, str>), /// Double quoted string: i.e: "string" DoubleQuotedString(String), /// Triple single quoted strings: Example '''abc''' @@ -110,7 +110,7 @@ pub enum BorrowedToken<'a> { /// Comma Comma, /// Whitespace (space, tab, etc) - Whitespace(Whitespace), + Whitespace(Whitespace<'a>), /// Double equals sign `==` DoubleEq, /// Equality operator `=` @@ -280,8 +280,6 @@ pub enum BorrowedToken<'a> { /// This is used to represent any custom binary operator that is not part of the SQL standard. /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR. 
CustomBinaryOperator(String), - /// Marker to carry the lifetime parameter (never constructed) - _Phantom(Cow<'a, str>), } /// Type alias for backward compatibility - Token without explicit lifetime uses 'static @@ -399,7 +397,6 @@ impl<'a> fmt::Display for BorrowedToken<'a> { BorrowedToken::QuestionAnd => write!(f, "?&"), BorrowedToken::QuestionPipe => write!(f, "?|"), BorrowedToken::CustomBinaryOperator(s) => f.write_str(s), - BorrowedToken::_Phantom(_) => unreachable!("_Phantom should never be constructed"), } } } @@ -409,10 +406,16 @@ impl<'a> BorrowedToken<'a> { pub fn to_static(self) -> Token { match self { BorrowedToken::EOF => BorrowedToken::EOF, - BorrowedToken::Word(w) => BorrowedToken::Word(w), + BorrowedToken::Word(w) => BorrowedToken::Word(Word { + value: Cow::Owned(w.value.into_owned()), + quote_style: w.quote_style, + keyword: w.keyword, + }), BorrowedToken::Number(n, l) => BorrowedToken::Number(n, l), BorrowedToken::Char(c) => BorrowedToken::Char(c), - BorrowedToken::SingleQuotedString(s) => BorrowedToken::SingleQuotedString(s), + BorrowedToken::SingleQuotedString(s) => { + BorrowedToken::SingleQuotedString(Cow::Owned(s.into_owned())) + } BorrowedToken::DoubleQuotedString(s) => BorrowedToken::DoubleQuotedString(s), BorrowedToken::TripleSingleQuotedString(s) => { BorrowedToken::TripleSingleQuotedString(s) @@ -450,7 +453,20 @@ impl<'a> BorrowedToken<'a> { BorrowedToken::UnicodeStringLiteral(s) => BorrowedToken::UnicodeStringLiteral(s), BorrowedToken::HexStringLiteral(s) => BorrowedToken::HexStringLiteral(s), BorrowedToken::Comma => BorrowedToken::Comma, - BorrowedToken::Whitespace(ws) => BorrowedToken::Whitespace(ws), + BorrowedToken::Whitespace(ws) => BorrowedToken::Whitespace(match ws { + Whitespace::Space => Whitespace::Space, + Whitespace::Newline => Whitespace::Newline, + Whitespace::Tab => Whitespace::Tab, + Whitespace::SingleLineComment { comment, prefix } => { + Whitespace::SingleLineComment { + comment: Cow::Owned(comment.into_owned()), + 
prefix: Cow::Owned(prefix.into_owned()), + } + } + Whitespace::MultiLineComment(s) => { + Whitespace::MultiLineComment(Cow::Owned(s.into_owned())) + } + }), BorrowedToken::DoubleEq => BorrowedToken::DoubleEq, BorrowedToken::Eq => BorrowedToken::Eq, BorrowedToken::Neq => BorrowedToken::Neq, @@ -545,7 +561,6 @@ impl<'a> BorrowedToken<'a> { BorrowedToken::QuestionAnd => BorrowedToken::QuestionAnd, BorrowedToken::QuestionPipe => BorrowedToken::QuestionPipe, BorrowedToken::CustomBinaryOperator(s) => BorrowedToken::CustomBinaryOperator(s), - BorrowedToken::_Phantom(_) => unreachable!("_Phantom should never be constructed"), } } } @@ -556,13 +571,26 @@ impl BorrowedToken<'static> { } pub fn make_word(word: &str, quote_style: Option) -> Self { - let word_uppercase = word.to_uppercase(); BorrowedToken::Word(Word { - value: word.to_string(), + value: Cow::Owned(word.to_string()), quote_style, keyword: if quote_style.is_none() { - let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str()); - keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x]) + crate::keywords::get_keyword(word).unwrap_or(Keyword::NoKeyword) + } else { + Keyword::NoKeyword + }, + }) + } +} + +impl<'a> BorrowedToken<'a> { + /// Create a Word token with a borrowed string (zero-copy) + pub fn make_word_borrowed(word: &'a str, quote_style: Option) -> Self { + BorrowedToken::Word(Word { + value: Cow::Borrowed(word), + quote_style, + keyword: if quote_style.is_none() { + crate::keywords::get_keyword(word).unwrap_or(Keyword::NoKeyword) } else { Keyword::NoKeyword }, @@ -574,10 +602,10 @@ impl BorrowedToken<'static> { #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] -pub struct Word { +pub struct Word<'a> { /// The value of the token, without the enclosing quotes, and with the /// escape sequences (if any) processed (TODO: escapes are not handled) - pub value: String, 
+ pub value: Cow<'a, str>, /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance). /// The standard and most implementations allow using double quotes for this, /// but some implementations support other quoting styles as well (e.g. \[MS SQL]) @@ -587,7 +615,7 @@ pub struct Word { pub keyword: Keyword, } -impl fmt::Display for Word { +impl fmt::Display for Word<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.quote_style { Some(s) if s == '"' || s == '[' || s == '`' => { @@ -599,7 +627,7 @@ impl fmt::Display for Word { } } -impl Word { +impl Word<'_> { fn matching_end_quote(ch: char) -> char { match ch { '"' => '"', // ANSI and most dialects @@ -613,15 +641,18 @@ impl Word { #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] -pub enum Whitespace { +pub enum Whitespace<'a> { Space, Newline, Tab, - SingleLineComment { comment: String, prefix: String }, - MultiLineComment(String), + SingleLineComment { + comment: Cow<'a, str>, + prefix: Cow<'a, str>, + }, + MultiLineComment(Cow<'a, str>), } -impl fmt::Display for Whitespace { +impl fmt::Display for Whitespace<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Whitespace::Space => f.write_str(" "), @@ -1016,7 +1047,7 @@ impl<'a> Tokenizer<'a> { /// assert_eq!(tokens, vec![ /// Token::make_word("SELECT", None), /// Token::Whitespace(Whitespace::Space), - /// Token::SingleQuotedString("foo".to_string()), + /// Token::SingleQuotedString("foo".to_string().into()), /// ]); pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self { Self { @@ -1117,15 +1148,18 @@ impl<'a> Tokenizer<'a> { &self, consumed_byte_len: usize, chars: &mut State<'a>, - ) -> Result, TokenizerError> { + ) -> Result>, TokenizerError> { chars.next(); // consume the first char - let word = self.tokenize_word(consumed_byte_len, chars)?; + + // Calculate 
where the first character started + let first_char_byte_pos = chars.byte_pos.saturating_sub(consumed_byte_len); + let word = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; // TODO: implement parsing of exponent here if word.chars().all(|x| x.is_ascii_digit() || x == '.') { let mut inner_state = State { peekable: word.chars().peekable(), - source: &word, + source: word, line: 0, col: 0, byte_pos: 0, @@ -1136,7 +1170,7 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::Number(s, false))); } - Ok(Some(Token::make_word(&word, None))) + Ok(Some(BorrowedToken::make_word_borrowed(word, None))) } /// Get the next token or return None @@ -1144,7 +1178,7 @@ impl<'a> Tokenizer<'a> { &self, chars: &mut State<'a>, prev_token: Option<&BorrowedToken<'a>>, - ) -> Result, TokenizerError> { + ) -> Result>, TokenizerError> { match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), @@ -1166,12 +1200,12 @@ impl<'a> Tokenizer<'a> { Some('\'') => { if self.dialect.supports_triple_quoted_string() { return self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '\'', false, - Token::SingleQuotedByteStringLiteral, - Token::TripleSingleQuotedByteStringLiteral, + BorrowedToken::SingleQuotedByteStringLiteral, + BorrowedToken::TripleSingleQuotedByteStringLiteral, ); } let s = self.tokenize_single_quoted_string(chars, '\'', false)?; @@ -1180,12 +1214,12 @@ impl<'a> Tokenizer<'a> { Some('\"') => { if self.dialect.supports_triple_quoted_string() { return self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '"', false, - Token::DoubleQuotedByteStringLiteral, - Token::TripleDoubleQuotedByteStringLiteral, + BorrowedToken::DoubleQuotedByteStringLiteral, + BorrowedToken::TripleDoubleQuotedByteStringLiteral, ); } let s = self.tokenize_single_quoted_string(chars, '\"', false)?; @@ 
-1193,8 +1227,9 @@ impl<'a> Tokenizer<'a> { } _ => { // regular identifier starting with an "b" or "B" - let s = self.tokenize_word(b.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(b.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } } } @@ -1203,25 +1238,26 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume match chars.peek() { Some('\'') => self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '\'', false, - Token::SingleQuotedRawStringLiteral, - Token::TripleSingleQuotedRawStringLiteral, + BorrowedToken::SingleQuotedRawStringLiteral, + BorrowedToken::TripleSingleQuotedRawStringLiteral, ), Some('\"') => self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '"', false, - Token::DoubleQuotedRawStringLiteral, - Token::TripleDoubleQuotedRawStringLiteral, + BorrowedToken::DoubleQuotedRawStringLiteral, + BorrowedToken::TripleDoubleQuotedRawStringLiteral, ), _ => { // regular identifier starting with an "r" or "R" - let s = self.tokenize_word(b.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(b.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } } } @@ -1239,8 +1275,9 @@ impl<'a> Tokenizer<'a> { } _ => { // regular identifier starting with an "N" - let s = self.tokenize_word(n.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(n.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } } } @@ -1256,8 +1293,9 @@ impl<'a> Tokenizer<'a> { } _ => { // regular identifier 
starting with an "E" or "e" - let s = self.tokenize_word(x.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } } } @@ -1275,8 +1313,9 @@ impl<'a> Tokenizer<'a> { } } // regular identifier starting with an "U" or "u" - let s = self.tokenize_word(x.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } // The spec only allows an uppercase 'X' to introduce a hex // string, but PostgreSQL, at least, allows a lowercase 'x' too. @@ -1290,8 +1329,9 @@ impl<'a> Tokenizer<'a> { } _ => { // regular identifier starting with an "X" - let s = self.tokenize_word(x.len_utf8(), chars)?; - Ok(Some(Token::make_word(&s, None))) + let first_char_byte_pos = chars.byte_pos.saturating_sub(x.len_utf8()); + let s = self.tokenize_word_borrowed(first_char_byte_pos, chars)?; + Ok(Some(BorrowedToken::make_word_borrowed(s, None))) } } } @@ -1299,21 +1339,21 @@ impl<'a> Tokenizer<'a> { '\'' => { if self.dialect.supports_triple_quoted_string() { return self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '\'', self.dialect.supports_string_literal_backslash_escape(), - Token::SingleQuotedString, - Token::TripleSingleQuotedString, + |s| BorrowedToken::SingleQuotedString(Cow::Owned(s)), + BorrowedToken::TripleSingleQuotedString, ); } - let s = self.tokenize_single_quoted_string( + let s = self.tokenize_single_quoted_string_borrowed( chars, '\'', self.dialect.supports_string_literal_backslash_escape(), )?; - Ok(Some(Token::SingleQuotedString(s))) + Ok(Some(BorrowedToken::SingleQuotedString(s))) } // double quoted string 
'\"' if !self.dialect.is_delimited_identifier_start(ch) @@ -1321,12 +1361,12 @@ impl<'a> Tokenizer<'a> { { if self.dialect.supports_triple_quoted_string() { return self - .tokenize_single_or_triple_quoted_string:: Token>( + .tokenize_single_or_triple_quoted_string:: BorrowedToken<'a>>( chars, '"', self.dialect.supports_string_literal_backslash_escape(), - Token::DoubleQuotedString, - Token::TripleDoubleQuotedString, + BorrowedToken::DoubleQuotedString, + BorrowedToken::TripleDoubleQuotedString, ); } let s = self.tokenize_single_quoted_string( @@ -1536,11 +1576,11 @@ impl<'a> Tokenizer<'a> { if is_comment { chars.next(); // consume second '-' - let comment = self.tokenize_single_line_comment(chars)?; - return Ok(Some(Token::Whitespace( + let comment = self.tokenize_single_line_comment_borrowed(chars)?; + return Ok(Some(BorrowedToken::Whitespace( Whitespace::SingleLineComment { - prefix: "--".to_owned(), - comment, + prefix: Cow::Borrowed("--"), + comment: Cow::Borrowed(comment), }, ))); } @@ -1567,11 +1607,13 @@ impl<'a> Tokenizer<'a> { } Some('/') if dialect_of!(self is SnowflakeDialect) => { chars.next(); // consume the second '/', starting a snowflake single-line comment - let comment = self.tokenize_single_line_comment(chars)?; - Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { - prefix: "//".to_owned(), - comment, - }))) + let comment = self.tokenize_single_line_comment_borrowed(chars)?; + Ok(Some(BorrowedToken::Whitespace( + Whitespace::SingleLineComment { + prefix: Cow::Borrowed("//"), + comment: Cow::Borrowed(comment), + }, + ))) } Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => { self.consume_and_return(chars, Token::DuckIntDiv) @@ -1773,11 +1815,13 @@ impl<'a> Tokenizer<'a> { '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => { chars.next(); // consume the '#', starting a snowflake single-line comment - let comment = self.tokenize_single_line_comment(chars)?; - 
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { - prefix: "#".to_owned(), - comment, - }))) + let comment = self.tokenize_single_line_comment_borrowed(chars)?; + Ok(Some(BorrowedToken::Whitespace( + Whitespace::SingleLineComment { + prefix: Cow::Borrowed("#"), + comment: Cow::Borrowed(comment), + }, + ))) } '~' => { chars.next(); // consume @@ -1923,10 +1967,10 @@ impl<'a> Tokenizer<'a> { /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix fn consume_for_binop( &self, - chars: &mut State, + chars: &mut State<'a>, prefix: &str, - default: Token, - ) -> Result, TokenizerError> { + default: BorrowedToken<'a>, + ) -> Result>, TokenizerError> { chars.next(); // consume the first char self.start_binop_opt(chars, prefix, Some(default)) } @@ -1934,20 +1978,20 @@ impl<'a> Tokenizer<'a> { /// parse a custom binary operator fn start_binop( &self, - chars: &mut State, + chars: &mut State<'a>, prefix: &str, - default: Token, - ) -> Result, TokenizerError> { + default: BorrowedToken<'a>, + ) -> Result>, TokenizerError> { self.start_binop_opt(chars, prefix, Some(default)) } /// parse a custom binary operator fn start_binop_opt( &self, - chars: &mut State, + chars: &mut State<'a>, prefix: &str, - default: Option, - ) -> Result, TokenizerError> { + default: Option>, + ) -> Result>, TokenizerError> { let mut custom = None; while let Some(&ch) = chars.peek() { if !self.dialect.is_custom_operator_part(ch) { @@ -2132,16 +2176,6 @@ impl<'a> Tokenizer<'a> { }) } - // Consume characters until newline - fn tokenize_single_line_comment( - &self, - chars: &mut State<'a>, - ) -> Result { - Ok(self - .tokenize_single_line_comment_borrowed(chars)? - .to_string()) - } - /// Tokenize a single-line comment, returning a borrowed slice. /// Returns a slice that includes the terminating newline character. 
fn tokenize_single_line_comment_borrowed( @@ -2167,29 +2201,6 @@ impl<'a> Tokenizer<'a> { self.safe_slice(chars.source, start_pos, chars.byte_pos, error_loc) } - /// Tokenize an identifier or keyword, after the first char(s) have already been consumed. - /// `consumed_byte_len` is the byte length of the consumed character(s). - fn tokenize_word( - &self, - consumed_byte_len: usize, - chars: &mut State<'a>, - ) -> Result { - let error_loc = chars.location(); - - // Overflow check: ensure we can safely subtract - if consumed_byte_len > chars.byte_pos { - return self.tokenizer_error(error_loc, "Invalid byte position in tokenize_word"); - } - - // Calculate where the first character started - let first_char_byte_pos = chars.byte_pos - consumed_byte_len; - - // Use the zero-copy version and convert to String - Ok(self - .tokenize_word_borrowed(first_char_byte_pos, chars)? - .to_string()) - } - /// Tokenize an identifier or keyword, returning a borrowed slice when possible. /// The first character position must be provided (before it was consumed). /// Returns a slice with the same lifetime as the State's source. @@ -2245,14 +2256,14 @@ impl<'a> Tokenizer<'a> { /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`. fn tokenize_single_or_triple_quoted_string( &self, - chars: &mut State, + chars: &mut State<'a>, quote_style: char, backslash_escape: bool, single_quote_token: F, triple_quote_token: F, - ) -> Result, TokenizerError> + ) -> Result>, TokenizerError> where - F: Fn(String) -> Token, + F: Fn(String) -> BorrowedToken<'a>, { let error_loc = chars.location(); @@ -2316,6 +2327,79 @@ impl<'a> Tokenizer<'a> { ) } + /// Reads a string literal quoted by a single quote character, returning Cow for zero-copy. + /// Returns Cow::Borrowed when the string has no escape sequences or doubled quotes, + /// Cow::Owned when processing is required. 
+ fn tokenize_single_quoted_string_borrowed( + &self, + chars: &mut State<'a>, + quote_style: char, + backslash_escape: bool, + ) -> Result, TokenizerError> { + let start_byte_pos = chars.byte_pos; + let error_loc = chars.location(); + + // Consume opening quote + if chars.next() != Some(quote_style) { + return self.tokenizer_error(error_loc, "Expected opening quote"); + } + + let content_start = chars.byte_pos; + let mut needs_processing = false; + + // Scan the string to detect if processing is needed + loop { + match chars.peek() { + None => { + return self.tokenizer_error(error_loc, "Unterminated string literal"); + } + Some(&ch) if ch == quote_style => { + // Found a quote - check if it's doubled or the end + let quote_pos = chars.byte_pos; + chars.next(); // consume quote + + if chars.peek() == Some("e_style) { + // Doubled quote - needs processing + needs_processing = true; + chars.next(); // consume second quote + } else { + // End of string + if needs_processing { + // Reset and use the owned version + chars.byte_pos = start_byte_pos; + chars.line = error_loc.line; + chars.col = error_loc.column; + // Recreate peekable from current position + let remaining = &chars.source[start_byte_pos..]; + chars.peekable = remaining.chars().peekable(); + + let s = self.tokenize_single_quoted_string( + chars, + quote_style, + backslash_escape, + )?; + return Ok(Cow::Owned(s)); + } else { + // Can use borrowed slice (excluding quotes) + return Ok(Cow::Borrowed(&chars.source[content_start..quote_pos])); + } + } + } + Some(&'\\') if backslash_escape => { + // Escape sequence - needs processing + needs_processing = true; + chars.next(); // consume backslash + if chars.next().is_none() { + return self.tokenizer_error(error_loc, "Unterminated string literal"); + } + } + Some(_) => { + chars.next(); // consume regular character + } + } + } + } + /// Read a quoted string. 
fn tokenize_quoted_string( &self, @@ -2426,11 +2510,11 @@ impl<'a> Tokenizer<'a> { fn tokenize_multiline_comment( &self, chars: &mut State<'a>, - ) -> Result, TokenizerError> { + ) -> Result>, TokenizerError> { let s = self.tokenize_multiline_comment_borrowed(chars)?; - Ok(Some(Token::Whitespace(Whitespace::MultiLineComment( - s.to_string(), - )))) + Ok(Some(BorrowedToken::Whitespace( + Whitespace::MultiLineComment(Cow::Borrowed(s)), + ))) } /// Tokenize a multi-line comment, returning a borrowed slice. @@ -2541,9 +2625,9 @@ impl<'a> Tokenizer<'a> { #[allow(clippy::unnecessary_wraps)] fn consume_and_return( &self, - chars: &mut State, - t: Token, - ) -> Result, TokenizerError> { + chars: &mut State<'a>, + t: BorrowedToken<'a>, + ) -> Result>, TokenizerError> { chars.next(); Ok(Some(t)) } @@ -3062,12 +3146,12 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Word(Word { - value: "foo".to_string(), + value: "foo".to_string().into(), quote_style: None, keyword: Keyword::NoKeyword, }), Token::DoubleEq, - Token::SingleQuotedString("1".to_string()), + Token::SingleQuotedString("1".to_string().into()), ]; compare(expected, tokens); @@ -3169,11 +3253,11 @@ mod tests { let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), - Token::SingleQuotedString(String::from("a")), + Token::SingleQuotedString(String::from("a").into()), Token::Whitespace(Whitespace::Space), Token::StringConcat, Token::Whitespace(Whitespace::Space), - Token::SingleQuotedString(String::from("b")), + Token::SingleQuotedString(String::from("b").into()), ]; compare(expected, tokens); @@ -3352,7 +3436,7 @@ mod tests { Token::Whitespace(Whitespace::Space), Token::Neq, Token::Whitespace(Whitespace::Space), - Token::SingleQuotedString(String::from("Not Provided")), + Token::SingleQuotedString(String::from("Not Provided").into()), ]; compare(expected, tokens); @@ -3379,7 +3463,9 @@ mod tests { let dialect = GenericDialect {}; let tokens = 
Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())]; + let expected = vec![Token::SingleQuotedString( + "foo\r\nbar\nbaz".to_string().into(), + )]; compare(expected, tokens); } @@ -3669,8 +3755,8 @@ mod tests { vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\n".to_string(), + prefix: "--".to_string().into(), + comment: "this is a comment\n".to_string().into(), }), Token::Number("1".to_string(), false), ], @@ -3680,8 +3766,8 @@ mod tests { vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\r1".to_string(), + prefix: "--".to_string().into(), + comment: "this is a comment\r1".to_string().into(), }), ], ), @@ -3690,8 +3776,8 @@ mod tests { vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment\r\n".to_string(), + prefix: "--".to_string().into(), + comment: "this is a comment\r\n".to_string().into(), }), Token::Number("1".to_string(), false), ], @@ -3715,8 +3801,8 @@ mod tests { let expected = vec![ Token::Number("1".to_string(), false), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "\r".to_string(), + prefix: "--".to_string().into(), + comment: "\r".to_string().into(), }), Token::Number("0".to_string(), false), ]; @@ -3730,8 +3816,8 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "this is a comment".to_string(), + prefix: "--".to_string().into(), + comment: "this is a comment".to_string().into(), })]; compare(expected, tokens); } @@ -3745,7 +3831,7 @@ mod tests { let expected = 
vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::MultiLineComment( - "multi-line\n* /comment".to_string(), + "multi-line\n* /comment".to_string().into(), )), Token::Number("1".to_string(), false), ]; @@ -3764,7 +3850,7 @@ mod tests { Token::Whitespace(Whitespace::Space), Token::Div, Token::Word(Word { - value: "comment".to_string(), + value: "comment".to_string().into(), quote_style: None, keyword: Keyword::COMMENT, }), @@ -3791,7 +3877,9 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), + Token::Whitespace(Whitespace::MultiLineComment( + " a /* b */ c ".to_string().into(), + )), Token::Number("0".to_string(), false), ], ); @@ -3805,7 +3893,7 @@ mod tests { Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), - Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), + Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string().into())), Token::Number("0".to_string(), false), ], ); @@ -3820,7 +3908,7 @@ mod tests { Token::Whitespace(Whitespace::Space), Token::Number("1".to_string(), false), Token::Whitespace(Whitespace::MultiLineComment( - "/* nested comment ".to_string(), + "/* nested comment ".to_string().into(), )), Token::Mul, Token::Div, @@ -3837,7 +3925,9 @@ mod tests { let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); let expected = vec![ Token::Whitespace(Whitespace::Newline), - Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())), + Token::Whitespace(Whitespace::MultiLineComment( + "* Comment *".to_string().into(), + )), Token::Whitespace(Whitespace::Newline), ]; compare(expected, tokens); @@ -4221,14 +4311,16 @@ mod tests { .with_unescape(false) .tokenize() .unwrap(); - let expected = vec![Token::SingleQuotedString(expected.to_string())]; + let expected = 
vec![Token::SingleQuotedString(expected.to_string().into())]; compare(expected, tokens); let tokens = Tokenizer::new(&dialect, sql) .with_unescape(true) .tokenize() .unwrap(); - let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())]; + let expected = vec![Token::SingleQuotedString( + expected_unescaped.to_string().into(), + )]; compare(expected, tokens); } @@ -4245,7 +4337,7 @@ mod tests { let dialect = GenericDialect {}; let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let expected = vec![Token::SingleQuotedString(expected.to_string())]; + let expected = vec![Token::SingleQuotedString(expected.to_string().into())]; compare(expected, tokens); } @@ -4255,7 +4347,7 @@ mod tests { let dialect = MySqlDialect {}; let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let expected = vec![Token::SingleQuotedString(expected.to_string())]; + let expected = vec![Token::SingleQuotedString(expected.to_string().into())]; compare(expected, tokens); } @@ -4358,7 +4450,7 @@ mod tests { .unwrap(); let expected = vec![ Token::DoubleQuotedString("".to_string()), - Token::SingleQuotedString("".to_string()), + Token::SingleQuotedString("".to_string().into()), ]; compare(expected, tokens); @@ -4368,7 +4460,7 @@ mod tests { .tokenize() .unwrap(); let expected = vec![ - Token::SingleQuotedString("".to_string()), + Token::SingleQuotedString("".to_string().into()), Token::DoubleQuotedString("".to_string()), ]; compare(expected, tokens); @@ -4377,7 +4469,7 @@ mod tests { let dialect = SnowflakeDialect {}; let sql = r#"''''''"#; let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let expected = vec![Token::SingleQuotedString("''".to_string())]; + let expected = vec![Token::SingleQuotedString("''".to_string().into())]; compare(expected, tokens); } @@ -4409,7 +4501,7 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::AtSign, - Token::SingleQuotedString("1".to_string()), + 
Token::SingleQuotedString("1".to_string().into()), ]; compare(expected, tokens); } @@ -4467,7 +4559,7 @@ mod tests { Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("e", None), - Token::SingleQuotedString("...".to_string()), + Token::SingleQuotedString("...".to_string().into()), ], ); @@ -4477,7 +4569,7 @@ mod tests { Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("E", None), - Token::SingleQuotedString("...".to_string()), + Token::SingleQuotedString("...".to_string().into()), ], ); } @@ -4513,7 +4605,7 @@ mod tests { Token::Whitespace(Whitespace::Space), Token::Minus, Token::Minus, - Token::SingleQuotedString("abc".to_string()), + Token::SingleQuotedString("abc".to_string().into()), ], ); @@ -4524,8 +4616,8 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: " 'abc'".to_string(), + prefix: "--".to_string().into(), + comment: " 'abc'".to_string().into(), }), ], ); @@ -4551,8 +4643,8 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "'abc'".to_string(), + prefix: "--".to_string().into(), + comment: "'abc'".to_string().into(), }), ], ); @@ -4564,8 +4656,8 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: " 'abc'".to_string(), + prefix: "--".to_string().into(), + comment: " 'abc'".to_string().into(), }), ], ); @@ -4577,8 +4669,8 @@ mod tests { Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "--".to_string(), - comment: "".to_string(), + prefix: "--".to_string().into(), + comment: "".to_string().into(), }), ], ); @@ -4622,13 +4714,13 @@ mod tests { 
Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Word(Word { - value: "table".to_string(), + value: "table".to_string().into(), quote_style: None, keyword: Keyword::TABLE, }), Token::Period, Token::Word(Word { - value: "_col".to_string(), + value: "_col".to_string().into(), quote_style: None, keyword: Keyword::NoKeyword, }), diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs index 0ef1c4f0..81ae78b2 100644 --- a/tests/sqlparser_bigquery.rs +++ b/tests/sqlparser_bigquery.rs @@ -2629,7 +2629,7 @@ fn test_export_data() { body: Box::new(SetExpr::Select(Box::new(Select { select_token: AttachedToken(TokenWithSpan::new( Token::Word(Word { - value: "SELECT".to_string(), + value: "SELECT".to_string().into(), quote_style: None, keyword: Keyword::SELECT, }), @@ -2733,7 +2733,7 @@ fn test_export_data() { body: Box::new(SetExpr::Select(Box::new(Select { select_token: AttachedToken(TokenWithSpan::new( Token::Word(Word { - value: "SELECT".to_string(), + value: "SELECT".to_string().into(), quote_style: None, keyword: Keyword::SELECT, }), diff --git a/tests/sqlparser_mssql.rs b/tests/sqlparser_mssql.rs index 24937d0a..76d25a2a 100644 --- a/tests/sqlparser_mssql.rs +++ b/tests/sqlparser_mssql.rs @@ -1581,7 +1581,7 @@ fn test_mssql_while_statement() { while_block: ConditionalStatementBlock { start_token: AttachedToken(TokenWithSpan { token: Token::Word(Word { - value: "WHILE".to_string(), + value: "WHILE".to_string().into(), quote_style: None, keyword: Keyword::WHILE }), diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index f187af1b..81cd66d9 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -566,8 +566,8 @@ fn test_snowflake_single_line_tokenize() { Token::Whitespace(Whitespace::Space), Token::make_keyword("TABLE"), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "#".to_string(), - comment: " this is a comment \n".to_string(), + prefix: "#".to_string().into(), + comment: 
" this is a comment \n".to_string().into(), }), Token::make_word("table_1", None), ]; @@ -583,8 +583,8 @@ fn test_snowflake_single_line_tokenize() { Token::make_keyword("TABLE"), Token::Whitespace(Whitespace::Space), Token::Whitespace(Whitespace::SingleLineComment { - prefix: "//".to_string(), - comment: " this is a comment \n".to_string(), + prefix: "//".to_string().into(), + comment: " this is a comment \n".to_string().into(), }), Token::make_word("table_1", None), ];