商城商品名,搜索成千上万,如何才能实时的获取到当前搜索热度最高的词,或者某一个搜索词排在第几呢?
ClickHouse 是俄罗斯的 Yandex 于 2016 年开源的用于在线分析处理查询(OLAP :Online Analytical Processing)MPP架构的列式存储数据库(DBMS:Database Management System),能够使用 SQL 查询实时生成分析数据报告。ClickHouse的全称是Click Stream,Data WareHouse。
clickhouse可以做用户行为分析,流批一体
线性扩展和可靠性保障能够原生支持 shard + replication
clickhouse没有走hadoop生态,采用 Local attached storage 作为存储
CREATE TABLE `search_log` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'logId',
`keyword` varchar(255) COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '搜索词',
`createTime` datetime DEFAULT NULL COMMENT '搜索时间',
PRIMARY KEY (`id`),
KEY `createTime` (`createTime`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
package main
import (
"database/sql"
"github.com/brianvoe/gofakeit/v6"
_ "github.com/go-sql-driver/mysql"
"log"
"math/rand"
"time"
)
type SearchLog struct {
Keyword string
CreateTime string
}
func getDb() *sql.DB {
dsn := "test_goods:xSGFwEXdXbY2GhZG@tcp(127.0.0.1:3306)/test_goods?charset=utf8mb4&parseTime=True"
db, err := sql.Open("mysql", dsn)
if err != nil {
panic(err)
}
return db
}
func main() {
for i := 0; i < 100; i++ {
for true {
searchData := make([]SearchLog, 0)
for j := 0; j < 500; j++ {
searchLog := SearchLog{
Keyword: randGoodsName(),
CreateTime: randTime(),
}
searchData = append(searchData, searchLog)
}
saveData(searchData)
searchData = searchData[0:0]
}
}
}
func randGoodsName() string {
// 生成一个长度在 3 到 50 之间的中文商品名称
gofakeit.Seed(time.Now().UnixNano())
name := gofakeit.Name()
for len([]rune(name)) > 50 || len([]rune(name)) < 3 {
name = gofakeit.Name()
}
return name
}
func randTime() string {
// 生成一个 2023 年 1 月 1 日到 2023 年 4 月 30 日之间的随机时间
start := time.Date(2023, 1, 1, 0, 0, 0, 0, time.Local).Unix()
end := time.Date(2023, 4, 30, 23, 59, 59, 0, time.Local).Unix()
randTime := time.Unix(rand.Int63n(end-start)+start, 0)
// 判断当前时间是否在白天(6:00-18:00),如果是,则再随机生成一次时间
if randTime.Hour() >= 6 && randTime.Hour() < 18 {
randTime = time.Unix(rand.Int63n(end-start)+start, 0)
}
return randTime.Format("2006-01-02 15:04:5")
}
func saveData(searchData []SearchLog) {
log.Println("开始保存数据")
db := getDb()
defer db.Close()
// 保存数据
// 插入多条数据
sqlStr := "insert into search_log (keyword,createTime) values"
vals := []interface{}{}
for index, row := range searchData {
if index == len(searchData)-1 {
sqlStr += "(?,?)"
} else {
sqlStr += "(?,?), "
}
vals = append(vals, row.Keyword, row.CreateTime)
}
_, err := db.Exec(sqlStr, vals...) // vals...: 解构
if err != nil {
log.Println("发生错误,重试:" + err.Error())
saveData(searchData)
}
log.Println("1000条处理完成")
}
运行go:
添加 2000万数据测试看看
创建一张mysql来源的表
create table search_log_mysql(
id Nullable(Int32),
keyword Nullable(String),
createTime Nullable(DateTime)
)engine=MySQL('127.0.0.1:3306','test_goods','search_log','test_goods','xSGFwEXdXbY2GhZG');
jx-home :) select count() as num,keyword from search_log_mysql group by keyword order by num desc limit 10
SELECT
count() AS num,
keyword
FROM search_log_mysql
GROUP BY keyword
ORDER BY num DESC
LIMIT 10
Query id: f22cf3bf-743e-441c-ba8e-7b2351c68f10
┌─num─┬─keyword────┐
│ 151 │ Ezra Terry │
│ 150 │ Lupe Davis │
│ 148 │ Mossie Rau │
│ 148 │ Leon Wiza │
│ 147 │ Mike Haley │
│ 146 │ Luis Fahey │
│ 146 │ Oral Boyle │
│ 145 │ May Boyer │
│ 145 │ Petra Orn │
│ 145 │ Gail Doyle │
└─────┴────────────┘
10 rows in set. Elapsed: 7.469 sec. Processed 21.97 million rows, 471.47 MB (2.94 million rows/s., 63.13 MB/s.)
jx-home :)
SELECT
count() AS num,
keyword
FROM search_log_mysql
WHERE (createTime >= '2023-02-01') AND (createTime < '2023-03-01')
GROUP BY keyword
ORDER BY num DESC
LIMIT 10
SELECT
count() AS num,
keyword
FROM search_log_mysql
WHERE (createTime >= '2023-02-01') AND (createTime < '2023-03-01')
GROUP BY keyword
ORDER BY num DESC
LIMIT 10
Query id: eebf52de-7791-4a9f-bdf2-6d4ad99cd908
┌─num─┬─keyword────┐
│ 52 │ Rita Hilll │
│ 49 │ Sage Adams │
│ 48 │ Tod Veum │
│ 48 │ Oral Boyle │
│ 47 │ Isac Olson │
│ 46 │ Jade Yundt │
│ 46 │ Clark Von │
│ 46 │ Neha Ryan │
│ 46 │ Jan Bailey │
│ 45 │ Tyra Hahn │
└─────┴────────────┘
10 rows in set. Elapsed: 7.117 sec. Processed 5.13 million rows, 135.78 MB (721.03 thousand rows/s., 19.08 MB/s.)
首先我们需要知道一个窗口函数
select keyword,
rank() OVER w AS rank,
row_number() OVER w AS row_number,
num
from (
SELECT
count() AS num,
keyword
FROM search_log_mysql
GROUP BY keyword
ORDER BY num DESC
)
WINDOW w AS (ORDER BY num DESC range unbounded preceding) limit 10
// WINDOW w AS (PARTITION BY keyword ORDER BY num DESC range unbounded preceding) limit 10 ####PARTITION BY xxx 可以根据某个字段值进行分组做窗口处理
输出:
再进行包裹一层,获取排名 查询商品Petra Orn 的排名
SELECT
keyword,
row_number
FROM
(
SELECT
keyword,
rank() OVER w AS rank,
row_number() OVER w AS row_number,
num
FROM
(
SELECT
count() AS num,
keyword
FROM search_log_mysql
GROUP BY keyword
ORDER BY num DESC
)
WINDOW w AS (ORDER BY num DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
)
WHERE keyword = 'Petra Orn'