datafusion-sqlparse/sqlparser_bench/benches/tokenize_bench.rs
Eyal Leshem 5458a2b21d Implement zero-copy tokenization for Word, SingleQuotedString, and Whitespace
Convert token string fields to use Cow<'a, str> to enable zero-copy tokenization
  for commonly used tokens:
  - Word.value: Regular identifiers and keywords now borrow from source
  - SingleQuotedString: String literals borrow when no escape processing needed
  - Whitespace: Single-line and multi-line comments borrow from source

Also add benchmark for measuring tokenization performance
2025-12-21 16:39:10 +02:00

862 lines
31 KiB
Rust

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Benchmark tokenization performance
//!
//! This benchmark measures tokenization speed using a complex SQL query
//! with many identifiers, keywords, string literals, and comments.
use criterion::{criterion_group, criterion_main, Criterion};
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::Tokenizer;
const COMPLEX_SQL: &str = r#"
-- ============================================================================
-- Enterprise Sales Analytics Dashboard Query
-- ============================================================================
-- Purpose: Comprehensive sales analysis across multiple dimensions
-- Author: Analytics Team
-- Last Modified: 2024-01-15
-- ============================================================================
/*
* This query aggregates sales data from multiple sources:
* - Customer transactions and lifetime value
* - Product performance across categories
* - Regional sales trends and patterns
* - Employee commission calculations
* - Inventory and fulfillment metrics
*/
WITH customer_segments AS (
-- Segment customers by purchase behavior and demographics
SELECT
customer_id,
customer_number,
customer_name,
customer_type,
customer_status,
customer_tier,
email_address,
phone_number,
mobile_number,
fax_number,
date_of_birth,
registration_date,
last_login_date,
account_status,
email_verified,
phone_verified,
-- Address information
billing_address_line1,
billing_address_line2,
billing_city,
billing_state,
billing_postal_code,
billing_country,
shipping_address_line1,
shipping_address_line2,
shipping_city,
shipping_state,
shipping_postal_code,
shipping_country,
-- Demographics
gender,
age_group,
income_bracket,
education_level,
occupation,
marital_status,
-- Marketing preferences
marketing_opt_in,
sms_opt_in,
email_frequency,
preferred_channel,
preferred_language,
-- Calculated fields
CASE
WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '30' DAY THEN 'highly_active'
WHEN customer_status = 'active' AND last_login_date >= CURRENT_DATE - INTERVAL '90' DAY THEN 'active'
WHEN customer_status = 'active' THEN 'inactive'
ELSE 'dormant'
END AS activity_level,
CASE
WHEN registration_date >= CURRENT_DATE - INTERVAL '1' YEAR THEN 'new'
WHEN registration_date >= CURRENT_DATE - INTERVAL '3' YEAR THEN 'established'
ELSE 'veteran'
END AS customer_tenure
FROM customers
WHERE customer_status IN ('active', 'pending', 'suspended')
AND registration_date >= '2020-01-01'
AND billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy')
AND email_address NOT LIKE '%@test.com'
AND email_address NOT LIKE '%@example.com'
AND customer_name IS NOT NULL
),
product_catalog AS (
-- Product information with categories and attributes
SELECT
product_id,
product_sku,
product_name,
product_description,
product_category,
product_subcategory,
product_brand,
product_manufacturer,
product_supplier,
product_model,
product_series,
product_version,
-- Pricing
list_price,
cost_price,
sale_price,
wholesale_price,
minimum_price,
suggested_retail_price,
-- Attributes
product_color,
product_size,
product_weight,
product_length,
product_width,
product_height,
product_material,
product_warranty,
-- Inventory
stock_quantity,
reorder_level,
reorder_quantity,
warehouse_location,
bin_location,
aisle_number,
shelf_number,
-- Status
product_status,
availability_status,
is_featured,
is_new_arrival,
is_on_sale,
is_clearance,
is_discontinued,
launch_date,
discontinuation_date,
-- Ratings
average_rating,
review_count,
return_rate,
defect_rate,
-- Categories
CASE
WHEN product_category = 'electronics' THEN 'high_tech'
WHEN product_category IN ('clothing', 'shoes', 'accessories') THEN 'fashion'
WHEN product_category IN ('home', 'garden', 'furniture') THEN 'home_living'
WHEN product_category IN ('sports', 'outdoor', 'fitness') THEN 'active_lifestyle'
ELSE 'general_merchandise'
END AS category_group
FROM products
WHERE product_status = 'active'
AND availability_status IN ('in_stock', 'low_stock', 'backorder')
AND is_discontinued = FALSE
AND launch_date <= CURRENT_DATE
),
order_transactions AS (
-- Order and transaction details
SELECT
order_id,
order_number,
order_date,
order_time,
order_timestamp,
customer_id,
order_status,
order_type,
order_channel,
order_source,
-- Payment information
payment_method,
payment_status,
payment_date,
payment_reference,
transaction_id,
authorization_code,
-- Financial details
subtotal_amount,
tax_amount,
shipping_amount,
discount_amount,
coupon_amount,
gift_card_amount,
total_amount,
paid_amount,
refund_amount,
net_amount,
-- Shipping details
shipping_method,
shipping_carrier,
tracking_number,
shipped_date,
estimated_delivery_date,
actual_delivery_date,
delivery_status,
signature_required,
-- Location
ship_to_address_line1,
ship_to_address_line2,
ship_to_city,
ship_to_state,
ship_to_postal_code,
ship_to_country,
-- Fulfillment
warehouse_id,
fulfillment_center,
picker_id,
packer_id,
shipper_id,
-- Timestamps
created_at,
updated_at,
completed_at,
cancelled_at,
-- Flags
is_gift,
is_rush_order,
is_international,
requires_signature,
is_business_order,
-- Notes
customer_notes,
internal_notes,
gift_message,
special_instructions
FROM orders
WHERE order_date >= '2023-01-01'
AND order_date < '2024-12-31'
AND order_status IN ('pending', 'processing', 'shipped', 'delivered', 'completed')
AND order_type IN ('standard', 'express', 'overnight', 'international')
AND total_amount > 0
AND customer_id IS NOT NULL
),
order_line_items AS (
-- Individual line items from orders
SELECT
line_item_id,
order_id,
product_id,
line_number,
-- Quantities
quantity_ordered,
quantity_shipped,
quantity_cancelled,
quantity_returned,
quantity_damaged,
-- Pricing
unit_price,
unit_cost,
unit_discount,
line_subtotal,
line_tax,
line_shipping,
line_total,
-- Discounts
discount_type,
discount_code,
discount_percentage,
discount_reason,
-- Product details at time of order
product_sku_snapshot,
product_name_snapshot,
product_category_snapshot,
-- Status
line_status,
fulfillment_status,
return_status,
-- Warehouse
picked_from_warehouse,
picked_from_location,
picked_by_user,
picked_at_timestamp,
packed_by_user,
packed_at_timestamp,
-- Returns
return_reason,
return_date,
refund_amount,
restocking_fee,
-- Gift wrap
is_gift_wrapped,
gift_wrap_type,
gift_wrap_charge,
-- Calculated fields
unit_price * quantity_ordered AS line_revenue,
unit_cost * quantity_ordered AS line_cost,
(unit_price - unit_cost) * quantity_ordered AS line_profit,
CASE
WHEN quantity_returned > 0 THEN 'returned'
WHEN quantity_cancelled > 0 THEN 'cancelled'
WHEN quantity_shipped = quantity_ordered THEN 'fulfilled'
ELSE 'partial'
END AS fulfillment_type
FROM order_items
WHERE line_status NOT IN ('cancelled', 'voided')
AND quantity_ordered > 0
),
employee_data AS (
-- Employee and sales representative information
SELECT
employee_id,
employee_number,
employee_name,
first_name,
last_name,
middle_name,
email_address,
phone_extension,
mobile_phone,
-- Employment details
hire_date,
termination_date,
employment_status,
employment_type,
job_title,
job_level,
job_grade,
department_id,
department_name,
division_id,
division_name,
-- Management
manager_id,
manager_name,
reports_to,
-- Location
office_location,
office_building,
office_floor,
office_room,
work_city,
work_state,
work_country,
-- Compensation
base_salary,
commission_rate,
bonus_target,
commission_tier,
-- Performance
sales_quota,
current_sales,
quota_attainment,
performance_rating,
last_review_date,
next_review_date
FROM employees
WHERE employment_status = 'active'
AND employee_id IS NOT NULL
AND hire_date <= CURRENT_DATE
),
customer_lifetime_metrics AS (
-- Calculate customer lifetime value and metrics
SELECT
cs.customer_id,
cs.customer_name,
cs.customer_tier,
cs.activity_level,
-- Order counts
COUNT(DISTINCT ot.order_id) AS total_orders,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.order_id END) AS orders_last_30_days,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.order_id END) AS orders_last_90_days,
COUNT(DISTINCT CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.order_id END) AS orders_last_year,
-- Revenue metrics
SUM(ot.total_amount) AS lifetime_revenue,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '30' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_30_days,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '90' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_90_days,
SUM(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount ELSE 0 END) AS revenue_last_year,
-- Average values
AVG(ot.total_amount) AS average_order_value,
AVG(CASE WHEN ot.order_date >= CURRENT_DATE - INTERVAL '365' DAY THEN ot.total_amount END) AS avg_order_value_last_year,
-- Product metrics
COUNT(DISTINCT oli.product_id) AS unique_products_purchased,
SUM(oli.quantity_ordered) AS total_items_purchased,
-- Return metrics
SUM(oli.quantity_returned) AS total_items_returned,
SUM(CASE WHEN oli.quantity_returned > 0 THEN oli.refund_amount ELSE 0 END) AS total_refund_amount,
-- Date ranges
MIN(ot.order_date) AS first_order_date,
MAX(ot.order_date) AS last_order_date,
MAX(ot.order_date) - MIN(ot.order_date) AS customer_lifespan_days,
-- Recency
CURRENT_DATE - MAX(ot.order_date) AS days_since_last_order
FROM customer_segments cs
LEFT JOIN order_transactions ot ON cs.customer_id = ot.customer_id
LEFT JOIN order_line_items oli ON ot.order_id = oli.order_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
cs.customer_id,
cs.customer_name,
cs.customer_tier,
cs.activity_level
),
product_performance AS (
-- Product sales performance metrics
SELECT
pc.product_id,
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.category_group,
-- Sales metrics
COUNT(DISTINCT oli.order_id) AS total_orders,
SUM(oli.quantity_ordered) AS total_quantity_sold,
SUM(oli.quantity_returned) AS total_quantity_returned,
SUM(oli.line_revenue) AS total_revenue,
SUM(oli.line_cost) AS total_cost,
SUM(oli.line_profit) AS total_profit,
-- Averages
AVG(oli.unit_price) AS average_selling_price,
AVG(oli.line_revenue) AS average_line_revenue,
-- Return rate
CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate,
-- Profit margin
CAST(SUM(oli.line_profit) AS DECIMAL) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin,
-- Rankings
RANK() OVER (PARTITION BY pc.product_category ORDER BY SUM(oli.line_revenue) DESC) AS revenue_rank_in_category,
RANK() OVER (ORDER BY SUM(oli.quantity_ordered) DESC) AS quantity_rank_overall
FROM product_catalog pc
INNER JOIN order_line_items oli ON pc.product_id = oli.product_id
INNER JOIN order_transactions ot ON oli.order_id = ot.order_id
WHERE ot.order_status IN ('delivered', 'completed')
AND ot.order_date >= '2023-01-01'
GROUP BY
pc.product_id,
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.category_group
),
regional_sales AS (
-- Sales performance by region
SELECT
cs.billing_country,
cs.billing_state,
cs.billing_city,
-- Order metrics
COUNT(DISTINCT ot.order_id) AS total_orders,
COUNT(DISTINCT cs.customer_id) AS unique_customers,
-- Revenue
SUM(ot.total_amount) AS total_revenue,
SUM(ot.shipping_amount) AS total_shipping_revenue,
SUM(ot.tax_amount) AS total_tax_collected,
AVG(ot.total_amount) AS average_order_value,
-- Time periods
SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2024,
SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END) AS revenue_2023,
-- Growth
(SUM(CASE WHEN ot.order_date >= '2024-01-01' THEN ot.total_amount ELSE 0 END) -
SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END)) /
NULLIF(SUM(CASE WHEN ot.order_date >= '2023-01-01' AND ot.order_date < '2024-01-01' THEN ot.total_amount ELSE 0 END), 0) AS year_over_year_growth
FROM customer_segments cs
INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
cs.billing_country,
cs.billing_state,
cs.billing_city
HAVING SUM(ot.total_amount) > 1000
),
monthly_trends AS (
-- Monthly sales trends and seasonality
SELECT
DATE_TRUNC('month', ot.order_date) AS order_month,
EXTRACT(YEAR FROM ot.order_date) AS order_year,
EXTRACT(MONTH FROM ot.order_date) AS month_number,
EXTRACT(QUARTER FROM ot.order_date) AS quarter_number,
-- Volume metrics
COUNT(DISTINCT ot.order_id) AS orders,
COUNT(DISTINCT ot.customer_id) AS customers,
SUM(oli.quantity_ordered) AS items_sold,
-- Financial metrics
SUM(ot.subtotal_amount) AS subtotal,
SUM(ot.tax_amount) AS tax,
SUM(ot.shipping_amount) AS shipping,
SUM(ot.discount_amount) AS discounts,
SUM(ot.total_amount) AS revenue,
-- Averages
AVG(ot.total_amount) AS avg_order_value,
AVG(oli.quantity_ordered) AS avg_items_per_order,
-- Moving averages
AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS three_month_moving_avg,
AVG(SUM(ot.total_amount)) OVER (ORDER BY DATE_TRUNC('month', ot.order_date) ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS six_month_moving_avg
FROM order_transactions ot
INNER JOIN order_line_items oli ON ot.order_id = oli.order_id
WHERE ot.order_status IN ('delivered', 'completed')
AND ot.order_date >= '2022-01-01'
GROUP BY
DATE_TRUNC('month', ot.order_date),
EXTRACT(YEAR FROM ot.order_date),
EXTRACT(MONTH FROM ot.order_date),
EXTRACT(QUARTER FROM ot.order_date)
),
category_analysis AS (
-- Category performance analysis
SELECT
pc.product_category,
pc.product_subcategory,
pc.category_group,
-- Sales
COUNT(DISTINCT oli.order_id) AS orders,
SUM(oli.quantity_ordered) AS quantity,
SUM(oli.line_revenue) AS revenue,
SUM(oli.line_profit) AS profit,
-- Market share
SUM(oli.line_revenue) / SUM(SUM(oli.line_revenue)) OVER () AS revenue_share,
-- Pricing
AVG(oli.unit_price) AS avg_price,
MIN(oli.unit_price) AS min_price,
MAX(oli.unit_price) AS max_price,
-- Profitability
SUM(oli.line_profit) / NULLIF(SUM(oli.line_revenue), 0) AS profit_margin,
-- Returns
SUM(oli.quantity_returned) AS returns,
CAST(SUM(oli.quantity_returned) AS DECIMAL) / NULLIF(SUM(oli.quantity_ordered), 0) AS return_rate
FROM product_catalog pc
INNER JOIN order_line_items oli ON pc.product_id = oli.product_id
INNER JOIN order_transactions ot ON oli.order_id = ot.order_id
WHERE ot.order_status IN ('delivered', 'completed')
GROUP BY
pc.product_category,
pc.product_subcategory,
pc.category_group
)
-- Main query combining all CTEs
SELECT
-- Customer information
cs.customer_id,
cs.customer_number,
cs.customer_name,
cs.customer_type,
cs.customer_tier,
cs.activity_level,
cs.customer_tenure,
cs.email_address,
cs.phone_number,
cs.billing_city,
cs.billing_state,
cs.billing_country,
cs.age_group,
cs.gender,
cs.income_bracket,
-- Customer metrics
clm.total_orders,
clm.orders_last_30_days,
clm.orders_last_90_days,
clm.orders_last_year,
clm.lifetime_revenue,
clm.revenue_last_30_days,
clm.revenue_last_90_days,
clm.revenue_last_year,
clm.average_order_value,
clm.unique_products_purchased,
clm.total_items_purchased,
clm.total_items_returned,
clm.first_order_date,
clm.last_order_date,
clm.days_since_last_order,
-- Order details
ot.order_id,
ot.order_number,
ot.order_date,
ot.order_status,
ot.order_type,
ot.order_channel,
ot.payment_method,
ot.payment_status,
ot.subtotal_amount,
ot.tax_amount,
ot.shipping_amount,
ot.discount_amount,
ot.total_amount,
ot.shipping_method,
ot.shipping_carrier,
ot.tracking_number,
ot.delivery_status,
-- Line item details
oli.line_item_id,
oli.product_id,
oli.quantity_ordered,
oli.quantity_shipped,
oli.unit_price,
oli.line_total,
oli.discount_type,
oli.line_status,
-- Product information
pc.product_sku,
pc.product_name,
pc.product_category,
pc.product_subcategory,
pc.product_brand,
pc.product_manufacturer,
pc.category_group,
pc.list_price,
pc.product_color,
pc.product_size,
pc.average_rating,
pc.review_count,
-- Product performance
pp.total_quantity_sold AS product_total_quantity_sold,
pp.total_revenue AS product_total_revenue,
pp.total_profit AS product_total_profit,
pp.return_rate AS product_return_rate,
pp.profit_margin AS product_profit_margin,
pp.revenue_rank_in_category,
-- Employee information
ed.employee_id,
ed.employee_name,
ed.job_title,
ed.department_name,
ed.office_location,
ed.commission_rate,
ed.sales_quota,
-- Regional metrics
rs.total_orders AS region_total_orders,
rs.unique_customers AS region_unique_customers,
rs.total_revenue AS region_total_revenue,
rs.average_order_value AS region_avg_order_value,
rs.year_over_year_growth AS region_yoy_growth,
-- Category metrics
ca.revenue AS category_revenue,
ca.profit AS category_profit,
ca.revenue_share AS category_revenue_share,
ca.profit_margin AS category_profit_margin,
ca.return_rate AS category_return_rate,
-- Monthly trends
mt.order_month,
mt.three_month_moving_avg,
mt.six_month_moving_avg,
-- Calculated fields
CASE
WHEN clm.lifetime_revenue > 10000 THEN 'vip'
WHEN clm.lifetime_revenue > 5000 THEN 'premium'
WHEN clm.lifetime_revenue > 1000 THEN 'standard'
ELSE 'basic'
END AS calculated_tier,
CASE
WHEN clm.days_since_last_order <= 30 THEN 'very_recent'
WHEN clm.days_since_last_order <= 90 THEN 'recent'
WHEN clm.days_since_last_order <= 180 THEN 'moderate'
ELSE 'at_risk'
END AS recency_segment,
CASE
WHEN clm.total_orders >= 50 THEN 'frequent'
WHEN clm.total_orders >= 20 THEN 'regular'
WHEN clm.total_orders >= 5 THEN 'occasional'
ELSE 'rare'
END AS frequency_segment,
oli.unit_price * oli.quantity_ordered AS calculated_line_revenue,
(oli.unit_price * oli.quantity_ordered) * (ed.commission_rate / 100) AS calculated_commission,
ROUND(oli.unit_price * oli.quantity_ordered * 0.9, 2) AS discounted_line_total,
-- Window functions
ROW_NUMBER() OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date DESC) AS order_recency_rank,
RANK() OVER (PARTITION BY cs.billing_country ORDER BY clm.lifetime_revenue DESC) AS customer_value_rank_in_country,
DENSE_RANK() OVER (PARTITION BY pc.product_category ORDER BY oli.quantity_ordered DESC) AS product_popularity_rank,
SUM(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_customer_revenue,
AVG(ot.total_amount) OVER (PARTITION BY cs.customer_id ORDER BY ot.order_date ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS rolling_5_order_avg,
-- Aggregates
SUM(oli.quantity_ordered) OVER (PARTITION BY pc.product_category) AS category_total_quantity,
COUNT(DISTINCT ot.order_id) OVER (PARTITION BY cs.billing_country, DATE_TRUNC('month', ot.order_date)) AS monthly_orders_in_country,
MAX(ot.total_amount) OVER (PARTITION BY cs.customer_id) AS customer_largest_order,
MIN(ot.order_date) OVER (PARTITION BY pc.product_id) AS product_first_sale_date
FROM customer_segments cs
INNER JOIN customer_lifetime_metrics clm ON cs.customer_id = clm.customer_id
INNER JOIN order_transactions ot ON cs.customer_id = ot.customer_id
INNER JOIN order_line_items oli ON ot.order_id = oli.order_id
INNER JOIN product_catalog pc ON oli.product_id = pc.product_id
INNER JOIN product_performance pp ON pc.product_id = pp.product_id
LEFT JOIN employee_data ed ON ot.order_id IN (
SELECT order_id FROM employee_assignments WHERE employee_id = ed.employee_id
)
LEFT JOIN regional_sales rs ON cs.billing_country = rs.billing_country
AND cs.billing_state = rs.billing_state
AND cs.billing_city = rs.billing_city
LEFT JOIN category_analysis ca ON pc.product_category = ca.product_category
AND pc.product_subcategory = ca.product_subcategory
LEFT JOIN monthly_trends mt ON DATE_TRUNC('month', ot.order_date) = mt.order_month
WHERE
-- Date filters
ot.order_date >= '2023-01-01'
AND ot.order_date < '2024-12-31'
-- Status filters
AND ot.order_status IN ('processing', 'shipped', 'delivered', 'completed')
AND oli.line_status NOT IN ('cancelled', 'voided', 'rejected')
AND cs.customer_status = 'active'
AND pc.product_status = 'active'
-- Geographic filters
AND cs.billing_country IN ('USA', 'Canada', 'Mexico', 'UK', 'Germany', 'France', 'Spain', 'Italy', 'Japan', 'Australia')
AND cs.billing_state NOT IN ('test', 'demo', 'internal')
-- Category filters
AND pc.product_category IN ('electronics', 'clothing', 'home', 'sports', 'books', 'toys', 'automotive', 'health', 'beauty', 'grocery')
AND pc.product_subcategory NOT LIKE '%test%'
-- Amount filters
AND ot.total_amount > 0
AND ot.total_amount < 100000
AND oli.quantity_ordered > 0
AND oli.unit_price > 0
-- Quality filters
AND cs.email_address NOT LIKE '%@test.com'
AND cs.email_address NOT LIKE '%@example.com'
AND cs.email_address NOT LIKE '%@invalid.com'
AND cs.customer_name NOT LIKE '%test%'
AND cs.customer_name NOT LIKE '%demo%'
AND pc.product_name NOT LIKE '%sample%'
AND pc.product_name NOT LIKE '%demo%'
-- Tier filters
AND cs.customer_tier IN ('gold', 'silver', 'bronze', 'platinum')
AND cs.activity_level IN ('highly_active', 'active')
-- Payment filters
AND ot.payment_status = 'completed'
AND ot.payment_method IN ('credit_card', 'debit_card', 'paypal', 'apple_pay', 'google_pay', 'bank_transfer')
-- Shipping filters
AND ot.delivery_status IN ('delivered', 'in_transit', 'out_for_delivery')
AND ot.shipping_method IN ('standard', 'express', 'overnight', 'two_day')
-- Channel filters
AND ot.order_channel IN ('web', 'mobile', 'tablet', 'phone', 'store', 'marketplace')
-- Null checks
AND cs.customer_id IS NOT NULL
AND ot.order_id IS NOT NULL
AND oli.product_id IS NOT NULL
AND pc.product_sku IS NOT NULL
AND ot.total_amount IS NOT NULL
GROUP BY
cs.customer_id, cs.customer_number, cs.customer_name, cs.customer_type, cs.customer_tier,
cs.activity_level, cs.customer_tenure, cs.email_address, cs.phone_number,
cs.billing_city, cs.billing_state, cs.billing_country, cs.age_group, cs.gender, cs.income_bracket,
clm.total_orders, clm.orders_last_30_days, clm.orders_last_90_days, clm.orders_last_year,
clm.lifetime_revenue, clm.revenue_last_30_days, clm.revenue_last_90_days, clm.revenue_last_year,
clm.average_order_value, clm.unique_products_purchased, clm.total_items_purchased,
clm.total_items_returned, clm.first_order_date, clm.last_order_date, clm.days_since_last_order,
ot.order_id, ot.order_number, ot.order_date, ot.order_status, ot.order_type, ot.order_channel,
ot.payment_method, ot.payment_status, ot.subtotal_amount, ot.tax_amount, ot.shipping_amount,
ot.discount_amount, ot.total_amount, ot.shipping_method, ot.shipping_carrier, ot.tracking_number,
ot.delivery_status, oli.line_item_id, oli.product_id, oli.quantity_ordered, oli.quantity_shipped,
oli.unit_price, oli.line_total, oli.discount_type, oli.line_status,
pc.product_sku, pc.product_name, pc.product_category, pc.product_subcategory, pc.product_brand,
pc.product_manufacturer, pc.category_group, pc.list_price, pc.product_color, pc.product_size,
pc.average_rating, pc.review_count, pp.total_quantity_sold, pp.total_revenue, pp.total_profit,
pp.return_rate, pp.profit_margin, pp.revenue_rank_in_category,
ed.employee_id, ed.employee_name, ed.job_title, ed.department_name, ed.office_location,
ed.commission_rate, ed.sales_quota, rs.total_orders, rs.unique_customers, rs.total_revenue,
rs.average_order_value, rs.year_over_year_growth,
ca.revenue, ca.profit, ca.revenue_share, ca.profit_margin, ca.return_rate,
mt.order_month, mt.three_month_moving_avg, mt.six_month_moving_avg
HAVING
SUM(oli.quantity_ordered) > 0
AND SUM(oli.line_total) > 0
AND COUNT(DISTINCT ot.order_id) >= 1
ORDER BY
clm.lifetime_revenue DESC,
clm.total_orders DESC,
ot.order_date DESC,
cs.customer_name ASC,
pc.product_category ASC,
pc.product_name ASC,
oli.line_number ASC,
ot.order_id ASC
LIMIT 100000
OFFSET 0;
-- Additional analytics queries for dashboard
-- Top customers by revenue
SELECT
customer_id,
customer_name,
customer_tier,
total_orders,
lifetime_revenue,
average_order_value,
days_since_last_order
FROM customer_lifetime_metrics
WHERE lifetime_revenue > 1000
ORDER BY lifetime_revenue DESC
LIMIT 100;
-- Top products by sales
SELECT
product_sku,
product_name,
product_category,
product_brand,
total_quantity_sold,
total_revenue,
total_profit,
profit_margin,
return_rate
FROM product_performance
WHERE total_revenue > 5000
ORDER BY total_revenue DESC
LIMIT 50;
-- Regional performance summary
SELECT
billing_country,
billing_state,
total_orders,
unique_customers,
total_revenue,
average_order_value,
year_over_year_growth
FROM regional_sales
WHERE total_revenue > 10000
ORDER BY total_revenue DESC;
"#;
fn tokenization_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("tokenization");
let dialect = GenericDialect {};
group.bench_function("tokenize_complex_sql", |b| {
b.iter(|| {
let mut tokenizer = Tokenizer::new(&dialect, COMPLEX_SQL);
tokenizer.tokenize().unwrap()
});
});
group.finish();
}
criterion_group!(benches, tokenization_benchmark);
criterion_main!(benches);