#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>

int sysctl_tcp_nometrics_save __read_mostly;

enum tcp_metric_index {
	TCP_METRIC_RTT,
	TCP_METRIC_RTTVAR,
	TCP_METRIC_SSTHRESH,
	TCP_METRIC_CWND,
	TCP_METRIC_REORDERING,

	/* Always last.  */
	TCP_METRIC_MAX,
};

struct tcp_fastopen_metrics {
	u16			mss;
	u16			syn_loss:10;	/* Recurring Fast Open SYN losses */
	unsigned long		last_syn_loss;	/* Last Fast Open SYN loss */
	struct tcp_fastopen_cookie	cookie;
};

struct tcp_metrics_block {
	struct tcp_metrics_block __rcu	*tcpm_next;
	struct inetpeer_addr		tcpm_addr;
	unsigned long			tcpm_stamp;
	u32				tcpm_ts;
	u32				tcpm_ts_stamp;
	u32				tcpm_lock;
	u32				tcpm_vals[TCP_METRIC_MAX];
	struct tcp_fastopen_metrics	tcpm_fastopen;
};

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
			      enum tcp_metric_index idx)
{
	return tm->tcpm_lock & (1 << idx);
}
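
/* A metric is "locked" when the matching RTAX_* attribute is locked on
 * the route (see tcpm_suck_dst() below).  Illustrative example: with
 * RTAX_RTT and RTAX_CWND locked, tcpm_lock equals
 * (1 << TCP_METRIC_RTT) | (1 << TCP_METRIC_CWND), and
 * tcp_metric_locked(tm, TCP_METRIC_RTT) returns true.
 */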

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
			  enum tcp_metric_index idx)
{
	return tm->tcpm_vals[idx];
}

static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
				  enum tcp_metric_index idx)
{
	return msecs_to_jiffies(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
			   enum tcp_metric_index idx,
			   u32 val)
{
	tm->tcpm_vals[idx] = val;
}

static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
				 enum tcp_metric_index idx,
				 u32 val)
{
	tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}

static bool addr_same(const struct inetpeer_addr *a,
		      const struct inetpeer_addr *b)
{
	const struct in6_addr *a6, *b6;

	if (a->family != b->family)
		return false;
	if (a->family == AF_INET)
		return a->addr.a4 == b->addr.a4;

	a6 = (const struct in6_addr *) &a->addr.a6[0];
	b6 = (const struct in6_addr *) &b->addr.a6[0];

	return ipv6_addr_equal(a6, b6);
}

struct tcpm_hash_bucket {
	struct tcp_metrics_block __rcu	*chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);

static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
	u32 val;

	val = 0;
	if (dst_metric_locked(dst, RTAX_RTT))
		val |= 1 << TCP_METRIC_RTT;
	if (dst_metric_locked(dst, RTAX_RTTVAR))
		val |= 1 << TCP_METRIC_RTTVAR;
	if (dst_metric_locked(dst, RTAX_SSTHRESH))
		val |= 1 << TCP_METRIC_SSTHRESH;
	if (dst_metric_locked(dst, RTAX_CWND))
		val |= 1 << TCP_METRIC_CWND;
	if (dst_metric_locked(dst, RTAX_REORDERING))
		val |= 1 << TCP_METRIC_REORDERING;
	tm->tcpm_lock = val;

	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
	tm->tcpm_ts = 0;
	tm->tcpm_ts_stamp = 0;
	tm->tcpm_fastopen.mss = 0;
	tm->tcpm_fastopen.syn_loss = 0;
	tm->tcpm_fastopen.cookie.len = 0;
}
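
/* The lock bits mirror per-route metric locks.  Hypothetical iproute2
 * example (assuming the usual "lock" keyword syntax):
 *
 *	ip route add 192.0.2.0/24 via 198.51.100.1 rtt lock 200ms
 *
 * would make dst_metric_locked(dst, RTAX_RTT) true, so the RTT cached
 * above is never overwritten by tcp_update_metrics().
 */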

static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
					  struct inetpeer_addr *addr,
					  unsigned int hash,
					  bool reclaim)
{
	struct tcp_metrics_block *tm;
	struct net *net;

	spin_lock_bh(&tcp_metrics_lock);
	net = dev_net(dst->dev);
	if (unlikely(reclaim)) {
		struct tcp_metrics_block *oldest;

		/* Overwrite the least recently stamped entry in this bucket. */
		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
		for (tm = rcu_dereference(oldest->tcpm_next); tm;
		     tm = rcu_dereference(tm->tcpm_next)) {
			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
				oldest = tm;
		}
		tm = oldest;
	} else {
		tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
		if (!tm)
			goto out_unlock;
	}
	tm->tcpm_addr = *addr;
	tm->tcpm_stamp = jiffies;

	tcpm_suck_dst(tm, dst);

	if (likely(!reclaim)) {
		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
	}

out_unlock:
	spin_unlock_bh(&tcp_metrics_lock);
	return tm;
}

#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)

static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
	/* Entries not touched for an hour are refreshed from the route. */
	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
		tcpm_suck_dst(tm, dst);
}

#define TCP_METRICS_RECLAIM_DEPTH 5
#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL

static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
	if (tm)
		return tm;
	if (depth > TCP_METRICS_RECLAIM_DEPTH)
		return TCP_METRICS_RECLAIM_PTR;
	return NULL;
}
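
/* Encoding example: a lookup that misses on a chain more than
 * TCP_METRICS_RECLAIM_DEPTH entries deep returns the non-NULL sentinel
 * TCP_METRICS_RECLAIM_PTR (0x1) instead of NULL; tcp_get_metrics() below
 * translates that sentinel into "reclaim the oldest entry in this
 * bucket" rather than letting the chain grow without bound.
 */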

static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
						   struct net *net, unsigned int hash)
{
	struct tcp_metrics_block *tm;
	int depth = 0;

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, addr))
			break;
		depth++;
	}
	return tcp_get_encode(tm, depth);
}

static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
						       struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;

	addr.family = req->rsk_ops->family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = inet_rsk(req)->rmt_addr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
		break;
	default:
		return NULL;
	}

	net = dev_net(dst->dev);
	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, &addr))
			break;
	}
	tcpm_check_stamp(tm, dst);
	return tm;
}

static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
	struct inet6_timewait_sock *tw6;
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;

	addr.family = tw->tw_family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = tw->tw_daddr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		tw6 = inet6_twsk((struct sock *)tw);
		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
		hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
		break;
	default:
		return NULL;
	}

	net = twsk_net(tw);
	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
	     tm = rcu_dereference(tm->tcpm_next)) {
		if (addr_same(&tm->tcpm_addr, &addr))
			break;
	}
	return tm;
}

static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
						 struct dst_entry *dst,
						 bool create)
{
	struct tcp_metrics_block *tm;
	struct inetpeer_addr addr;
	unsigned int hash;
	struct net *net;
	bool reclaim;

	addr.family = sk->sk_family;
	switch (addr.family) {
	case AF_INET:
		addr.addr.a4 = inet_sk(sk)->inet_daddr;
		hash = (__force unsigned int) addr.addr.a4;
		break;
	case AF_INET6:
		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
		break;
	default:
		return NULL;
	}

	net = dev_net(dst->dev);
	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

	tm = __tcp_get_metrics(&addr, net, hash);
	reclaim = false;
	if (tm == TCP_METRICS_RECLAIM_PTR) {
		reclaim = true;
		tm = NULL;
	}
	if (!tm && create)
		tm = tcpm_new(dst, &addr, hash, reclaim);
	else
		tcpm_check_stamp(tm, dst);

	return tm;
}
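
/* Lookup flow, illustrated: a miss on a bucket whose chain is deeper than
 * TCP_METRICS_RECLAIM_DEPTH comes back as TCP_METRICS_RECLAIM_PTR, so
 * with create == true the new entry overwrites the least-recently-stamped
 * block in place (reclaim) instead of lengthening the chain; a miss on a
 * shallow chain simply kmalloc()s and prepends a fresh block.
 */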

/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_metrics_block *tm;
	unsigned long rtt;
	u32 val;
	int m;

	if (sysctl_tcp_nometrics_save || !dst)
		return;

	if (dst->flags & DST_HOST)
		dst_confirm(dst);

	rcu_read_lock();
	if (icsk->icsk_backoff || !tp->srtt) {
		/* This session failed to estimate rtt.  Why?  Probably,
		 * no packets returned in time.  Reset our results.
		 */
		tm = tcp_get_metrics(sk, dst, false);
		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
		goto out_unlock;
	} else
		tm = tcp_get_metrics(sk, dst, true);

	if (!tm)
		goto out_unlock;

	rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
	m = rtt - tp->srtt;

	/* If newly calculated rtt larger than stored one, store new
	 * one.  Otherwise, use EWMA.  Remember, rtt overestimation is
	 * always better than underestimation.
	 */
	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
		if (m <= 0)
			rtt = tp->srtt;
		else
			rtt -= (m >> 3);
		tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
	}
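
	/* Worked example with illustrative numbers: a stored rtt of 200 and
	 * a final srtt of 240 gives m == -40 <= 0, so the larger value 240
	 * is stored outright; a stored rtt of 200 and srtt of 120 gives
	 * m == 80, so the cache only decays 1/8 of the way, to
	 * 200 - (80 >> 3) == 190.
	 */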

	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
		unsigned long var;

		if (m < 0)
			m = -m;

		/* Scale deviation to rttvar fixed point */
		m >>= 1;
		if (m < tp->mdev)
			m = tp->mdev;

		var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
		if (m >= var)
			var = m;
		else
			var -= (var - m) >> 2;

		tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
	}
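
	/* Worked example with illustrative numbers: a deviation |m| == 64
	 * scales to 32; assuming tp->mdev is smaller and the stored rttvar
	 * is 40, the cache decays by (40 - 32) >> 2 == 2 down to 38, while
	 * a scaled deviation of 48 or more would replace the stored 40
	 * outright.
	 */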

	if (tcp_in_initial_slowstart(tp)) {
		/* Slow start still did not finish. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && (tp->snd_cwnd >> 1) > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_cwnd >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			if (tp->snd_cwnd > val)
				tcp_metric_set(tm, TCP_METRIC_CWND,
					       tp->snd_cwnd);
		}
	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
		   icsk->icsk_ca_state == TCP_CA_Open) {
		/* Cong. avoidance phase, cwnd is reliable. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
		}
	} else {
		/* Else slow start did not finish, cwnd is non-sense,
		 * ssthresh may also be invalid.
		 */
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_ssthresh) >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && tp->snd_ssthresh > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_ssthresh);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
			if (val < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				tcp_metric_set(tm, TCP_METRIC_REORDERING,
					       tp->reordering);
		}
	}
	tm->tcpm_stamp = jiffies;
out_unlock:
	rcu_read_unlock();
}
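
/* Worked example with illustrative numbers: a session ending in
 * congestion avoidance with snd_cwnd == 40, snd_ssthresh == 30 and a
 * cached cwnd of 20 stores ssthresh = max(40 >> 1, 30) == 30 and
 * cwnd = (20 + 40) >> 1 == 30 for the next connection to this peer.
 */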

/* Initialize metrics on socket. */

void tcp_init_metrics(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_metrics_block *tm;
	u32 val;

	if (dst == NULL)
		goto reset;

	dst_confirm(dst);

	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, true);
	if (!tm) {
		rcu_read_unlock();
		goto reset;
	}

	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
	if (val) {
		tp->snd_ssthresh = val;
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	} else {
		/* ssthresh may have been reduced unnecessarily during
		 * 3WHS.  Restore it back to its initial default.
		 */
		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	}
	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
	if (val && tp->reordering != val) {
		tcp_disable_fack(tp);
		tcp_disable_early_retrans(tp);
		tp->reordering = val;
	}

	val = tcp_metric_get(tm, TCP_METRIC_RTT);
	if (val == 0 || tp->srtt == 0) {
		rcu_read_unlock();
		goto reset;
	}
	/* Initial rtt is determined from SYN,SYN-ACK.
	 * The segment is small and rtt may appear much
	 * less than real one.  Use per-dst memory
	 * to make it more realistic.
	 *
	 * A bit of theory.  RTT is time passed after "normal" sized packet
	 * is sent until it is ACKed.  In normal circumstances sending small
	 * packets force peer to delay ACKs and calculation is correct too.
	 * The algorithm is adaptive and, provided we follow specs, it
	 * NEVER underestimates RTT.  BUT!  If peer tries to make some clever
	 * tricks sort of "quick acks" for time long enough to decrease RTT
	 * to low value, and then abruptly stops to do it and starts to delay
	 * ACKs, wait for troubles.
	 */
	val = msecs_to_jiffies(val);
	if (val > tp->srtt) {
		tp->srtt = val;
		tp->rtt_seq = tp->snd_nxt;
	}
	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
	if (val > tp->mdev) {
		tp->mdev = val;
		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
	}
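
	/* Seeding example with illustrative numbers: with a cached RTT of
	 * 300 ms and a 3WHS srtt of 40 ms, srtt (and mdev, if the cached
	 * rttvar is larger) is raised to the cached value, so the first
	 * data RTO is derived from 300 ms rather than from one small
	 * SYN/SYN-ACK sample.
	 */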

	rcu_read_unlock();

	tcp_set_rto(sk);
reset:
	if (tp->srtt == 0) {
		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
		 * 3WHS.  This is most likely due to retransmission,
		 * including spurious one.  Reset the RTO back to 3 secs
		 * from the more aggressive 1 sec to avoid more spurious
		 * retransmission.
		 */
		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
	}
	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
	 * retransmitted.  In light of RFC6298 more aggressive 1 sec
	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
	 * retransmission has occurred.
	 */
	if (tp->total_retrans > 1)
		tp->snd_cwnd = 1;
	else
		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
{
	struct tcp_metrics_block *tm;
	bool ret;

	if (!dst)
		return false;

	rcu_read_lock();
	tm = __tcp_get_metrics_req(req, dst);
	if (paws_check) {
		if (tm &&
		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
			ret = false;
		else
			ret = true;
	} else {
		if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
			ret = true;
		else
			ret = false;
	}
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_peer_is_proven);

void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, dst, true);
	if (tm) {
		struct tcp_sock *tp = tcp_sk(sk);

		if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
			tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
			tp->rx_opt.ts_recent = tm->tcpm_ts;
		}
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);

/* VJ's idea.  Save last timestamp seen from this destination and hold
 * it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */
bool tcp_remember_stamp(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	bool ret = false;

	if (dst) {
		struct tcp_metrics_block *tm;

		rcu_read_lock();
		tm = tcp_get_metrics(sk, dst, true);
		if (tm) {
			struct tcp_sock *tp = tcp_sk(sk);

			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
				tm->tcpm_ts = tp->rx_opt.ts_recent;
			}
			ret = true;
		}
		rcu_read_unlock();
	}
	return ret;
}
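
/* Illustrative scenario: if the cached TSval for a peer is 5000 and a
 * connection closes with ts_recent == 7000, the first condition above
 * holds and the cache advances to 7000; a later SYN from that peer
 * carrying the stale TSval 6000 can then fail the PAWS test in
 * tcp_peer_is_proven() above.
 */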

bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct tcp_metrics_block *tm;
	bool ret = false;

	rcu_read_lock();
	tm = __tcp_get_metrics_tw(tw);
	if (tm) {
		const struct tcp_timewait_sock *tcptw;
		struct sock *sk = (struct sock *) tw;

		tcptw = tcp_twsk(sk);
		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
			tm->tcpm_ts = tcptw->tw_ts_recent;
		}
		ret = true;
	}
	rcu_read_unlock();

	return ret;
}

static DEFINE_SEQLOCK(fastopen_seqlock);

void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
			    struct tcp_fastopen_cookie *cookie,
			    int *syn_loss, unsigned long *last_syn_loss)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fastopen_seqlock);
			if (tfom->mss)
				*mss = tfom->mss;
			*cookie = tfom->cookie;
			*syn_loss = tfom->syn_loss;
			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
		} while (read_seqretry(&fastopen_seqlock, seq));
	}
	rcu_read_unlock();
}
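
/* The seqlock keeps readers lock-free: if tcp_fastopen_cache_set() below
 * races with the copy loop above, read_seqretry() sees the bumped
 * sequence counter and the reader simply re-reads mss, cookie and the
 * SYN-loss state, so it can never observe a half-updated cookie.
 */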

void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
	struct tcp_metrics_block *tm;

	rcu_read_lock();
	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
	if (tm) {
		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

		write_seqlock_bh(&fastopen_seqlock);
		tfom->mss = mss;
		if (cookie->len > 0)
			tfom->cookie = *cookie;
		if (syn_lost) {
			++tfom->syn_loss;
			tfom->last_syn_loss = jiffies;
		} else
			tfom->syn_loss = 0;
		write_sequnlock_bh(&fastopen_seqlock);
	}
	rcu_read_unlock();
}

static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtouint(str, 0, &tcpmhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);
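
/* Usage example: booting with tcpmhash_entries=16384 on the kernel
 * command line overrides the RAM-based default below, giving every
 * network namespace a 16384-bucket metrics hash.
 */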

static int __net_init tcp_net_metrics_init(struct net *net)
{
	size_t size;
	unsigned int slots;

	slots = tcpmhash_entries;
	if (!slots) {
		if (totalram_pages >= 128 * 1024)
			slots = 16 * 1024;
		else
			slots = 8 * 1024;
	}

	net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
	size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;

	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
	if (!net->ipv4.tcp_metrics_hash)
		return -ENOMEM;

	return 0;
}
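
/* Sizing example with illustrative numbers: on a 4 KB-page machine with
 * at least 512 MB of RAM (128 * 1024 pages), slots defaults to 16384,
 * tcp_metrics_hash_log becomes 14, and the bucket array is 16384
 * pointers, i.e. 128 KB per network namespace on a 64-bit kernel.
 */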

static void __net_exit tcp_net_metrics_exit(struct net *net)
{
	kfree(net->ipv4.tcp_metrics_hash);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
	.init	=	tcp_net_metrics_init,
	.exit	=	tcp_net_metrics_exit,
};

void __init tcp_metrics_init(void)
{
	register_pernet_subsys(&tcp_net_metrics_ops);
}