/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                   to avoid the possible race between timer
 *                                   handler and del_timer thread in SMP
 *
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
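/*
 * A minimal sketch of the decision above as plain C, for readers who
 * prefer code to pseudo code.  It is illustrative only and is not
 * compiled: the helper names (cache_lookup, cache_assign, wlc_pick,
 * is_dead, half_spare_node_exists) are hypothetical; the real code
 * paths are ip_vs_lblc_get(), __ip_vs_lblc_schedule() and
 * ip_vs_lblc_new() below.
 */
#if 0
static struct node *lblc_pick(const union nf_inet_addr *dip)
{
	struct node *n = cache_lookup(dip);	/* cachenode[dest_ip] */

	if (!n || is_dead(n) ||
	    (n->conns > n->weight && half_spare_node_exists()))
		n = cache_assign(dip, wlc_pick());	/* weighted least-conn */
	return n;					/* sticky for dest_ip */
}
#endif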
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>
/*
 *    It is for garbage collection of stale IPVS lblc entries,
 *    when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

#define DEFAULT_EXPIRATION      (24*60*60*HZ)

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
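/*
 * Worked timing: the periodic timer fires every CHECK_EXPIRE_INTERVAL
 * (60s).  After COUNT_FOR_FULL_EXPIRATION ticks without a partial
 * sweep (30 * 60s = the half hour above), a full sweep evicts entries
 * idle past DEFAULT_EXPIRATION (24h); partial sweeps, which run only
 * while the table is over max_size, use the much shorter ENTRY_TIMEOUT
 * (6 min) cutoff.
 */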
/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
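/*
 * Worked sizing: with the default 10 bits the table has 1 << 10 = 1024
 * buckets and a mask of 0x3ff; ip_vs_lblc_init_svc() below allows an
 * average of 16 entries per bucket (max_size = 1024 * 16 = 16384)
 * before the garbage collector starts trimming.
 */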
/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct hlist_node	list;
	int			af;		/* address family */
	union nf_inet_addr	addr;		/* destination IP address */
	struct ip_vs_dest	*dest;		/* real server (cache) */
	unsigned long		lastuse;	/* last used time */
	struct rcu_head		rcu_head;
};
/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct rcu_head		rcu_head;
	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];	/* hash bucket */
	struct timer_list	periodic_timer;	/* collect stale entries */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
};
/*
 *      IPVS LBLC sysctl table
 */
#ifdef CONFIG_SYSCTL
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#endif
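/*
 * Usage sketch: proc_dointvec_jiffies exposes the jiffies value as
 * seconds, so with a standard build the expiration can be tuned with,
 * e.g.:
 *
 *     echo 86400 > /proc/sys/net/ipv4/vs/lblc_expiration
 *
 * (86400s = the 24h DEFAULT_EXPIRATION; the path follows from the
 * register_net_sysctl(net, "net/ipv4/vs", ...) call below.)
 */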
static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
	struct ip_vs_lblc_entry *en = container_of(head,
						   struct ip_vs_lblc_entry,
						   rcu_head);

	ip_vs_dest_put_and_free(en->dest);
	kfree(en);
}

static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
{
	hlist_del_rcu(&en->list);
	call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
}
/*
 *	Returns hash value for IPVS LBLC entry
 */
static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}
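/*
 * Note: 2654435761 (0x9E3779B1) is a prime close to 2^32 divided by
 * the golden ratio, the classic Fibonacci-hashing multiplier; it
 * scatters nearby addresses across the table before the low
 * IP_VS_LBLC_TAB_BITS bits are masked off.
 */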
/*
 *	Hash an entry in the ip_vs_lblc_table.
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}
/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned int hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}
/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
 * address to a server. Called under spin lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       u16 af, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(af, tbl, daddr);
	if (en) {
		if (en->dest == dest)
			return en;
		ip_vs_lblc_del(en);
	}
	en = kmalloc(sizeof(*en), GFP_ATOMIC);
	if (!en)
		return NULL;

	en->af = af;
	ip_vs_addr_copy(af, &en->addr, daddr);
	en->lastuse = jiffies;

	ip_vs_dest_hold(dest);
	en->dest = dest;

	ip_vs_lblc_hash(tbl, en);

	return en;
}
/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	int i;

	spin_lock_bh(&svc->sched_lock);
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
	}
	spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	return svc->ipvs->sysctl_lblc_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_lblc_expiration(svc)))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
		spin_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}
/*
 *      Periodical timer handler for IPVS lblc table
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check is for this purpose now.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}
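/*
 * Worked example of the pacing above: with the default max_size of
 * 16384, a table holding 20480 entries gives
 * goal = (20480 - 16384) * 4/3 = 5461 evictions for this tick, and the
 * max_size/2 = 8192 cap guarantees a single tick never drains more
 * than half the table.
 */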
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
		    (unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}
static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(svc);

	/* release the table itself */
	kfree_rcu(tbl, rcu_head);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));
}
static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
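	 *
	 * Worked example with illustrative numbers: loh = 10 at weight
	 * w1 = 3 is a load of ~3.33, doh = 6 at weight w2 = 2 is a load
	 * of 3; the cross-multiplied test 10*2 > 6*3 (20 > 18) holds,
	 * so dest replaces least without any division.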
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}
/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
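/*
 * This is the header's pseudo-code condition in integer arithmetic:
 * dest is overloaded when dest.conns > dest.weight and some node m has
 * m.conns < m.weight/2, written above as m.activeconns*2 < m.weight to
 * avoid the division.
 */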
/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		    struct ip_vs_iphdr *iph)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet,
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations, that are not in the trash, are not
		 * moved to the trash, while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */

		dest = en->dest;
		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
			goto out;
	}

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	spin_lock_bh(&svc->sched_lock);
	ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	return dest;
}
/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};
/*
 *  per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblc_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!net_eq(net, &init_net)) {
		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
					       sizeof(vs_vars_table),
					       GFP_KERNEL);
		if (ipvs->lblc_ctl_table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			ipvs->lblc_ctl_table[0].procname = NULL;

	} else
		ipvs->lblc_ctl_table = vs_vars_table;
	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;

	ipvs->lblc_ctl_header =
		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
	if (!ipvs->lblc_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblc_ctl_table);
		return -ENOMEM;
	}

	return 0;
}
static void __net_exit __ip_vs_lblc_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblc_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblc_ctl_table);
}

#else

static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblc_exit(struct net *net) { }

#endif
static struct pernet_operations ip_vs_lblc_ops = {
	.init = __ip_vs_lblc_init,
	.exit = __ip_vs_lblc_exit,
};
static int __init ip_vs_lblc_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblc_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblc_ops);
	return ret;
}
static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	unregister_pernet_subsys(&ip_vs_lblc_ops);
	/* wait for in-flight ip_vs_lblc_rcu_free callbacks before unload */
	rcu_barrier();
}
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");