面对千万级数据量的多列统计查询,以下是几种有效的优化策略:
复合索引设计:
-- Composite index matching the statistics queries in this article:
-- equality column (group_id) first, then the range column (date).
-- col1/col2 are appended so SUM(col1)/SUM(col2) can be answered from
-- the index alone (a true covering index -- the original
-- (group_column, date_column) index neither matched the query columns
-- nor covered the aggregated ones).
ALTER TABLE large_table ADD INDEX idx_stats (group_id, date, col1, col2);
覆盖索引利用:
-- Covering query: with an index on (group_id, date, col1, col2) every
-- referenced column comes from the index, avoiding row lookups.
-- Half-open date range: safe even if `date` is DATETIME, where a closed
-- BETWEEN '2023-01-01' AND '2023-12-31' would miss all rows after
-- midnight on Dec 31.
SELECT SUM(col1), SUM(col2)
FROM large_table
WHERE group_id = 123
  AND date >= '2023-01-01'
  AND date < '2024-01-01';
分批处理:
-- Process the table in fixed-size id ranges so each statement touches a
-- bounded number of rows; batches can run sequentially or in parallel
-- and a failed batch can be retried on its own.
SELECT SUM(col1), SUM(col2)
FROM large_table
WHERE id >= 1 AND id <= 1000000;

SELECT SUM(col1), SUM(col2)
FROM large_table
WHERE id >= 1000001 AND id <= 2000000;
条件精确化:
-- Add every filter you can: the more selective the predicates, the fewer
-- rows the engine has to scan and aggregate.
SELECT SUM(col1), SUM(col2)
FROM large_table
WHERE status = 'active'
  AND region = 'east'
  AND date = '2023-01-01';
物化视图/汇总表:
-- Pre-aggregated daily summary table: dashboard/report queries read this
-- small table instead of scanning large_table.
CREATE TABLE stats_daily (
    date DATE,
    group_id INT,
    sum_col1 DECIMAL(20,2),
    sum_col2 DECIMAL(20,2),
    PRIMARY KEY (date, group_id)
);
-- Refresh job (run shortly after midnight): aggregate yesterday's
-- *complete* data only. The half-open range excludes today's partial
-- rows -- the original `>= CURDATE() - INTERVAL 1 DAY` filter with no
-- upper bound would also fold in whatever has accumulated today.
INSERT INTO stats_daily (date, group_id, sum_col1, sum_col2)
SELECT
    DATE(created_at),
    group_id,
    SUM(col1),
    SUM(col2)
FROM large_table
WHERE created_at >= CURDATE() - INTERVAL 1 DAY
  AND created_at < CURDATE()
GROUP BY DATE(created_at), group_id
ON DUPLICATE KEY UPDATE
    -- NOTE(review): VALUES() is deprecated since MySQL 8.0.20 in favor of
    -- row aliases (INSERT ... AS new ... UPDATE sum_col1 = new.sum_col1);
    -- kept here for compatibility with older servers.
    sum_col1 = VALUES(sum_col1),
    sum_col2 = VALUES(sum_col2);
-- Partition by date range (yearly): queries filtering on date_column
-- scan only the matching partitions (partition pruning).
-- NOTE(review): in MySQL the partition expression's column must be part
-- of every PRIMARY/UNIQUE key on the table, and this ALTER rebuilds the
-- entire table -- on a ten-million-row table plan a maintenance window.
-- TODO confirm target engine/version before running.
ALTER TABLE large_table PARTITION BY RANGE (YEAR(date_column)) (
PARTITION p2020 VALUES LESS THAN (2021),
PARTITION p2021 VALUES LESS THAN (2022),
PARTITION p2022 VALUES LESS THAN (2023),
PARTITION p2023 VALUES LESS THAN (2024),
-- Catch-all partition so future years don't make inserts fail.
PARTITION pmax VALUES LESS THAN MAXVALUE
);
列式存储引擎:
-- Use a columnar database such as ClickHouse for analytical queries:
-- column storage reads only the columns referenced by the query, which
-- makes wide-table aggregate scans far cheaper than row storage.
CREATE TABLE stats_columnar (
id UInt64,
col1 Float64,
col2 Float64,
date Date
) ENGINE = MergeTree()
-- Sort key: rows are stored ordered by (date, id), so date-range
-- filters read a contiguous slice of the data.
ORDER BY (date, id);
使用Redis缓存:
# Python example: cache statistics results in Redis so repeated requests
# skip the expensive database aggregation.
# NOTE(review): assumes `execute_query`, `group_id` and `date` are
# defined by the surrounding application code.
import redis

r = redis.Redis()
cache_key = f"stats:{group_id}:{date}"
cached_result = r.get(cache_key)
if cached_result is None:
    # Cache miss: run the expensive query, then cache for 1 hour.
    # (`is None` instead of `not cached_result`, so a legitimately
    # empty cached value is not treated as a miss.)
    result = execute_query(group_id, date)
    r.setex(cache_key, 3600, result)
else:
    # Cache hit: the original snippet never used the cached value.
    # redis-py returns bytes; decode before handing it back.
    result = cached_result.decode("utf-8")
-- Inspect the execution plan to confirm the intended index is used and
-- no full table scan remains.
-- NOTE(review): EXPLAIN ANALYZE requires MySQL 8.0.18+ and actually
-- executes the query; on older versions use plain EXPLAIN, which only
-- estimates without running it.
EXPLAIN ANALYZE
SELECT SUM(col1), SUM(col2)
FROM large_table
WHERE group_id = 123
AND date BETWEEN '2023-01-01' AND '2023-12-31';
通过组合使用这些策略,可以显著提高千万级数据量下的多列统计查询性能。