t-digest
t-digest 是一种概率数据结构,可用于估计数据流的百分位数。
t-digest 是 Redis Stack 中的一种草图数据结构,用于使用紧凑的草图从数据流或大型数据集估算百分位数。
它可以回答以下问题
- 数据流中哪些分数的值小于给定值?
- 数据流中有多少个值小于给定值?
- 数据流中小于数据流中 p 百分比的值的最大值是多少?(p 百分位数是多少?)
什么是 t-digest?
t-digest 是一种数据结构,它可以估算百分位点,而无需存储和排序集合中的所有数据点。例如:要回答“99% 的数据库操作的平均延迟是多少”这个问题,我们必须存储每个用户的平均延迟,对值进行排序,剔除最后 1%,然后才能找到所有其余值的平均值。这种处理过程不仅在对这些值进行排序所需的处理方面代价高昂,而且在存储它们所需的存储空间方面代价也高昂。这些正是 t-digest 解决的问题。
t-digest 还可以用于估算与百分位数相关的其他值,例如修剪均值。
修剪均值是草图的平均值,不包括低截断百分位数和高截断百分位数之外的观测值。例如,0.1 修剪均值是草图的平均值,不包括最低 10% 和最高 10% 的值。
用例
硬件/软件监控
您测量在线服务器响应延迟,并且您希望查询
在线游戏
数百万人在您的在线游戏平台上玩游戏,您想向每个玩家提供以下信息?
网络流量监控
您每秒测量通过网络传输的 IP 数据包,并尝试通过询问来检测拒绝服务攻击
预测性维护
示例
在以下示例中,您将创建一个压缩率为 100 的 t-digest 并向其中添加项目。COMPRESSION
参数用于指定准确性和内存消耗之间的权衡。默认值为 100。值越高,准确性越高。注意:与其他一些概率数据结构不同,TDIGEST.ADD
命令在键不存在时不会创建新结构。
> TDIGEST.CREATE bikes:sales COMPRESSION 100
OK
> TDIGEST.ADD bikes:sales 21
OK
> TDIGEST.ADD bikes:sales 150 95 75 34
OK
package io.redis.examples;
public class TDigestExample {
public void run(){
UnifiedJedis unifiedJedis = new UnifiedJedis("redis://127.0.0.1:6379");
String res1 = unifiedJedis.tdigestCreate("bikes:sales", 100);
System.out.println(res1); // >>> True
String res2 = unifiedJedis.tdigestAdd("bikes:sales", 21);
System.out.println(res2); // >>> OK
String res3 = unifiedJedis.tdigestAdd("bikes:sales", 150, 95, 75, 34);
System.out.println(res3); // >>> OK
String res4 = unifiedJedis.tdigestCreate("racer_ages");
System.out.println(res4); // >>> True
String res5 = unifiedJedis.tdigestAdd("racer_ages", 45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63);
System.out.println(res5); // >>> OK
List<Long> res6 = unifiedJedis.tdigestRank("racer_ages", 50);
System.out.println(res6); // >>> [7]
List<Long> res7 = unifiedJedis.tdigestRank("racer_ages", 50, 40);
System.out.println(res7); // >>> [7, 4]
List<Double> res8 = unifiedJedis.tdigestQuantile("racer_ages", 0.5);
System.out.println(res8); // >>> [44.2]
List<Double> res9 = unifiedJedis.tdigestByRank("racer_ages", 4);
System.out.println(res9); // >>> [42.63]
double res10 = unifiedJedis.tdigestMin("racer_ages");
System.out.println(res10); // >>> 19.27
double res11 = unifiedJedis.tdigestMax("racer_ages");
System.out.println(res11); // >>> 85.71
String res12 = unifiedJedis.tdigestReset("racer_ages");
System.out.println(res12); // >>> OK
}
}
import redis
r = redis.Redis(decode_responses=True)
res1 = r.tdigest().create("bikes:sales", 100)
print(res1) # >>> True
res2 = r.tdigest().add("bikes:sales", [21])
print(res2) # >>> OK
res3 = r.tdigest().add("bikes:sales", [150, 95, 75, 34])
print(res3) # >>> OK
res4 = r.tdigest().create("racer_ages")
print(res4) # >>> True
res5 = r.tdigest().add(
"racer_ages",
[
45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63,
],
)
print(res5) # >>> OK
res6 = r.tdigest().rank("racer_ages", 50)
print(res6) # >>> [7]
res7 = r.tdigest().rank("racer_ages", 50, 40)
print(res7) # >>> [7, 4]
res8 = r.tdigest().quantile("racer_ages", 0.5)
print(res8) # >>> [44.2]
res9 = r.tdigest().byrank("racer_ages", 4)
print(res9) # >>> [42.63]
res10 = r.tdigest().min("racer_ages")
print(res10) # >>> 19.27
res11 = r.tdigest().max("racer_ages")
print(res11) # >>> 85.71
res12 = r.tdigest().reset("racer_ages")
print(res12) # >>> OK
每当有新观测值时,你可以重复调用 TDIGEST.ADD
通过值估计分数或等级
t-digest 的另一个有用的功能是 CDF(等级定义),它为我们提供了小于或等于某个值的观测值分数。此命令非常有助于回答诸如“观测值低于或等于 X 的百分比是多少”之类的问题。
更准确地说,TDIGEST.CDF
将返回草图中小于 X 的观测值的估计分数,加上等于 X 的观测值数量的一半。我们还可以使用 TDIGEST.RANK
命令,它非常相似。它不返回分数,而是返回值的 ----估计---- 等级。TDIGEST.RANK
命令也是可变参数的,这意味着你可以使用单个命令来检索一个或多个值的估计值。
这里有一个示例。给定一组骑自行车者的年龄,你可以提出这样的问题:“低于 50 岁的自行车赛车手的百分比是多少?”
> TDIGEST.CREATE racer_ages
OK
> TDIGEST.ADD racer_ages 45.88 44.2 58.03 19.76 39.84 69.28 50.97 25.41 19.27 85.71 42.63
OK
> TDIGEST.CDF racer_ages 50
1) "0.63636363636363635"
> TDIGEST.RANK racer_ages 50
1) (integer) 7
> TDIGEST.RANK racer_ages 50 40
1) (integer) 7
2) (integer) 4
package io.redis.examples;
public class TDigestExample {
public void run(){
UnifiedJedis unifiedJedis = new UnifiedJedis("redis://127.0.0.1:6379");
String res1 = unifiedJedis.tdigestCreate("bikes:sales", 100);
System.out.println(res1); // >>> True
String res2 = unifiedJedis.tdigestAdd("bikes:sales", 21);
System.out.println(res2); // >>> OK
String res3 = unifiedJedis.tdigestAdd("bikes:sales", 150, 95, 75, 34);
System.out.println(res3); // >>> OK
String res4 = unifiedJedis.tdigestCreate("racer_ages");
System.out.println(res4); // >>> True
String res5 = unifiedJedis.tdigestAdd("racer_ages", 45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63);
System.out.println(res5); // >>> OK
List<Long> res6 = unifiedJedis.tdigestRank("racer_ages", 50);
System.out.println(res6); // >>> [7]
List<Long> res7 = unifiedJedis.tdigestRank("racer_ages", 50, 40);
System.out.println(res7); // >>> [7, 4]
List<Double> res8 = unifiedJedis.tdigestQuantile("racer_ages", 0.5);
System.out.println(res8); // >>> [44.2]
List<Double> res9 = unifiedJedis.tdigestByRank("racer_ages", 4);
System.out.println(res9); // >>> [42.63]
double res10 = unifiedJedis.tdigestMin("racer_ages");
System.out.println(res10); // >>> 19.27
double res11 = unifiedJedis.tdigestMax("racer_ages");
System.out.println(res11); // >>> 85.71
String res12 = unifiedJedis.tdigestReset("racer_ages");
System.out.println(res12); // >>> OK
}
}
import redis
r = redis.Redis(decode_responses=True)
res1 = r.tdigest().create("bikes:sales", 100)
print(res1) # >>> True
res2 = r.tdigest().add("bikes:sales", [21])
print(res2) # >>> OK
res3 = r.tdigest().add("bikes:sales", [150, 95, 75, 34])
print(res3) # >>> OK
res4 = r.tdigest().create("racer_ages")
print(res4) # >>> True
res5 = r.tdigest().add(
"racer_ages",
[
45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63,
],
)
print(res5) # >>> OK
res6 = r.tdigest().rank("racer_ages", 50)
print(res6) # >>> [7]
res7 = r.tdigest().rank("racer_ages", 50, 40)
print(res7) # >>> [7, 4]
res8 = r.tdigest().quantile("racer_ages", 0.5)
print(res8) # >>> [44.2]
res9 = r.tdigest().byrank("racer_ages", 4)
print(res9) # >>> [42.63]
res10 = r.tdigest().min("racer_ages")
print(res10) # >>> 19.27
res11 = r.tdigest().max("racer_ages")
print(res11) # >>> 85.71
res12 = r.tdigest().reset("racer_ages")
print(res12) # >>> OK
最后,TDIGEST.REVRANK key value...
与 TDIGEST.RANK 类似,但会为每个输入值返回对(大于给定值 + 等于给定值的一半观测值)的观测值数量的估计。
按分数或等级估计值
TDIGEST.QUANTILE key fraction...
为每个输入分数返回对(小于给定观测值分数的给定值(浮点数))的估计。TDIGEST.BYRANK key rank...
为每个输入等级返回对(具有该等级的给定值(浮点数))的估计。
> TDIGEST.QUANTILE racer_ages .5
1) "44.200000000000003"
> TDIGEST.BYRANK racer_ages 4
1) "42.630000000000003"
package io.redis.examples;
public class TDigestExample {
public void run(){
UnifiedJedis unifiedJedis = new UnifiedJedis("redis://127.0.0.1:6379");
String res1 = unifiedJedis.tdigestCreate("bikes:sales", 100);
System.out.println(res1); // >>> True
String res2 = unifiedJedis.tdigestAdd("bikes:sales", 21);
System.out.println(res2); // >>> OK
String res3 = unifiedJedis.tdigestAdd("bikes:sales", 150, 95, 75, 34);
System.out.println(res3); // >>> OK
String res4 = unifiedJedis.tdigestCreate("racer_ages");
System.out.println(res4); // >>> True
String res5 = unifiedJedis.tdigestAdd("racer_ages", 45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63);
System.out.println(res5); // >>> OK
List<Long> res6 = unifiedJedis.tdigestRank("racer_ages", 50);
System.out.println(res6); // >>> [7]
List<Long> res7 = unifiedJedis.tdigestRank("racer_ages", 50, 40);
System.out.println(res7); // >>> [7, 4]
List<Double> res8 = unifiedJedis.tdigestQuantile("racer_ages", 0.5);
System.out.println(res8); // >>> [44.2]
List<Double> res9 = unifiedJedis.tdigestByRank("racer_ages", 4);
System.out.println(res9); // >>> [42.63]
double res10 = unifiedJedis.tdigestMin("racer_ages");
System.out.println(res10); // >>> 19.27
double res11 = unifiedJedis.tdigestMax("racer_ages");
System.out.println(res11); // >>> 85.71
String res12 = unifiedJedis.tdigestReset("racer_ages");
System.out.println(res12); // >>> OK
}
}
import redis
r = redis.Redis(decode_responses=True)
res1 = r.tdigest().create("bikes:sales", 100)
print(res1) # >>> True
res2 = r.tdigest().add("bikes:sales", [21])
print(res2) # >>> OK
res3 = r.tdigest().add("bikes:sales", [150, 95, 75, 34])
print(res3) # >>> OK
res4 = r.tdigest().create("racer_ages")
print(res4) # >>> True
res5 = r.tdigest().add(
"racer_ages",
[
45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63,
],
)
print(res5) # >>> OK
res6 = r.tdigest().rank("racer_ages", 50)
print(res6) # >>> [7]
res7 = r.tdigest().rank("racer_ages", 50, 40)
print(res7) # >>> [7, 4]
res8 = r.tdigest().quantile("racer_ages", 0.5)
print(res8) # >>> [44.2]
res9 = r.tdigest().byrank("racer_ages", 4)
print(res9) # >>> [42.63]
res10 = r.tdigest().min("racer_ages")
print(res10) # >>> 19.27
res11 = r.tdigest().max("racer_ages")
print(res11) # >>> 85.71
res12 = r.tdigest().reset("racer_ages")
print(res12) # >>> OK
TDIGEST.BYREVRANK key rank...
为每个输入反向等级返回对(具有该反向等级的值(浮点数))的估计。
估计修剪后的平均值
使用 TDIGEST.TRIMMED_MEAN key lowFraction highFraction
检索对指定分数之间的平均值的估计。
这对于计算忽略异常值的平均值特别有用。例如 - 计算第 20 个百分位和第 80 个百分位之间的平均值。
合并草图
有时合并草图很有用。例如,假设我们测量了 3 台服务器的延迟,并且我们想要计算所有服务器组合的 90%、95% 和 99% 延迟。
TDIGEST.MERGE destKey numKeys sourceKey... [COMPRESSION compression] [OVERRIDE]
将多个草图合并成一个草图。
如果 destKey
不存在 - 将创建一个新草图。
如果 destKey
是一个现有的草图,它的值将与源键的值合并。要覆盖目标键内容,请使用 OVERRIDE
。
使用 TDIGEST.MIN
和 TDIGEST.MAX
分别检索草图中的最小值和最大值。
> TDIGEST.MIN racer_ages
"19.27"
> TDIGEST.MAX racer_ages
"85.709999999999994"
package io.redis.examples;
public class TDigestExample {
public void run(){
UnifiedJedis unifiedJedis = new UnifiedJedis("redis://127.0.0.1:6379");
String res1 = unifiedJedis.tdigestCreate("bikes:sales", 100);
System.out.println(res1); // >>> True
String res2 = unifiedJedis.tdigestAdd("bikes:sales", 21);
System.out.println(res2); // >>> OK
String res3 = unifiedJedis.tdigestAdd("bikes:sales", 150, 95, 75, 34);
System.out.println(res3); // >>> OK
String res4 = unifiedJedis.tdigestCreate("racer_ages");
System.out.println(res4); // >>> True
String res5 = unifiedJedis.tdigestAdd("racer_ages", 45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63);
System.out.println(res5); // >>> OK
List<Long> res6 = unifiedJedis.tdigestRank("racer_ages", 50);
System.out.println(res6); // >>> [7]
List<Long> res7 = unifiedJedis.tdigestRank("racer_ages", 50, 40);
System.out.println(res7); // >>> [7, 4]
List<Double> res8 = unifiedJedis.tdigestQuantile("racer_ages", 0.5);
System.out.println(res8); // >>> [44.2]
List<Double> res9 = unifiedJedis.tdigestByRank("racer_ages", 4);
System.out.println(res9); // >>> [42.63]
double res10 = unifiedJedis.tdigestMin("racer_ages");
System.out.println(res10); // >>> 19.27
double res11 = unifiedJedis.tdigestMax("racer_ages");
System.out.println(res11); // >>> 85.71
String res12 = unifiedJedis.tdigestReset("racer_ages");
System.out.println(res12); // >>> OK
}
}
import redis
r = redis.Redis(decode_responses=True)
res1 = r.tdigest().create("bikes:sales", 100)
print(res1) # >>> True
res2 = r.tdigest().add("bikes:sales", [21])
print(res2) # >>> OK
res3 = r.tdigest().add("bikes:sales", [150, 95, 75, 34])
print(res3) # >>> OK
res4 = r.tdigest().create("racer_ages")
print(res4) # >>> True
res5 = r.tdigest().add(
"racer_ages",
[
45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63,
],
)
print(res5) # >>> OK
res6 = r.tdigest().rank("racer_ages", 50)
print(res6) # >>> [7]
res7 = r.tdigest().rank("racer_ages", 50, 40)
print(res7) # >>> [7, 4]
res8 = r.tdigest().quantile("racer_ages", 0.5)
print(res8) # >>> [44.2]
res9 = r.tdigest().byrank("racer_ages", 4)
print(res9) # >>> [42.63]
res10 = r.tdigest().min("racer_ages")
print(res10) # >>> 19.27
res11 = r.tdigest().max("racer_ages")
print(res11) # >>> 85.71
res12 = r.tdigest().reset("racer_ages")
print(res12) # >>> OK
当草图为空时,两者都返回 nan
。
这两个命令都返回准确的结果,分别等效于 TDIGEST.BYRANK racer_ages 0
和 TDIGEST.BYREVRANK racer_ages 0
。
使用 TDIGEST.INFO racer_ages
检索有关草图的一些其他信息。
重置草图
> TDIGEST.RESET racer_ages
OK
package io.redis.examples;
public class TDigestExample {
public void run(){
UnifiedJedis unifiedJedis = new UnifiedJedis("redis://127.0.0.1:6379");
String res1 = unifiedJedis.tdigestCreate("bikes:sales", 100);
System.out.println(res1); // >>> True
String res2 = unifiedJedis.tdigestAdd("bikes:sales", 21);
System.out.println(res2); // >>> OK
String res3 = unifiedJedis.tdigestAdd("bikes:sales", 150, 95, 75, 34);
System.out.println(res3); // >>> OK
String res4 = unifiedJedis.tdigestCreate("racer_ages");
System.out.println(res4); // >>> True
String res5 = unifiedJedis.tdigestAdd("racer_ages", 45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63);
System.out.println(res5); // >>> OK
List<Long> res6 = unifiedJedis.tdigestRank("racer_ages", 50);
System.out.println(res6); // >>> [7]
List<Long> res7 = unifiedJedis.tdigestRank("racer_ages", 50, 40);
System.out.println(res7); // >>> [7, 4]
List<Double> res8 = unifiedJedis.tdigestQuantile("racer_ages", 0.5);
System.out.println(res8); // >>> [44.2]
List<Double> res9 = unifiedJedis.tdigestByRank("racer_ages", 4);
System.out.println(res9); // >>> [42.63]
double res10 = unifiedJedis.tdigestMin("racer_ages");
System.out.println(res10); // >>> 19.27
double res11 = unifiedJedis.tdigestMax("racer_ages");
System.out.println(res11); // >>> 85.71
String res12 = unifiedJedis.tdigestReset("racer_ages");
System.out.println(res12); // >>> OK
}
}
import redis
r = redis.Redis(decode_responses=True)
res1 = r.tdigest().create("bikes:sales", 100)
print(res1) # >>> True
res2 = r.tdigest().add("bikes:sales", [21])
print(res2) # >>> OK
res3 = r.tdigest().add("bikes:sales", [150, 95, 75, 34])
print(res3) # >>> OK
res4 = r.tdigest().create("racer_ages")
print(res4) # >>> True
res5 = r.tdigest().add(
"racer_ages",
[
45.88,
44.2,
58.03,
19.76,
39.84,
69.28,
50.97,
25.41,
19.27,
85.71,
42.63,
],
)
print(res5) # >>> OK
res6 = r.tdigest().rank("racer_ages", 50)
print(res6) # >>> [7]
res7 = r.tdigest().rank("racer_ages", 50, 40)
print(res7) # >>> [7, 4]
res8 = r.tdigest().quantile("racer_ages", 0.5)
print(res8) # >>> [44.2]
res9 = r.tdigest().byrank("racer_ages", 4)
print(res9) # >>> [42.63]
res10 = r.tdigest().min("racer_ages")
print(res10) # >>> 19.27
res11 = r.tdigest().max("racer_ages")
print(res11) # >>> 85.71
res12 = r.tdigest().reset("racer_ages")
print(res12) # >>> OK
学术资料
参考