net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115 #include <linux/user_namespace.h>
 116 #include <linux/static_key.h>
 117 #include <linux/memcontrol.h>
 118 #include <linux/prefetch.h>
 119
 120 #include <asm/uaccess.h>
 121
 122 #include <linux/netdevice.h>
 123 #include <net/protocol.h>
 124 #include <linux/skbuff.h>
 125 #include <net/net_namespace.h>
 126 #include <net/request_sock.h>
 127 #include <net/sock.h>
 128 #include <linux/net_tstamp.h>
 129 #include <net/xfrm.h>
 130 #include <linux/ipsec.h>
 131 #include <net/cls_cgroup.h>
 132 #include <net/netprio_cgroup.h>
 133
 134 #include <linux/filter.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #ifdef CONFIG_INET
 139 #include <net/tcp.h>
 140 #endif
 141
 142 #include <net/ll_poll.h>
 143
 144 static DEFINE_MUTEX(proto_list_mutex);
 145 static LIST_HEAD(proto_list);
 146
 147 #ifdef CONFIG_MEMCG_KMEM
 148 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 149 {
 150         struct proto *proto;
 151         int ret = 0;
 152
 153         mutex_lock(&proto_list_mutex);
 154         list_for_each_entry(proto, &proto_list, node) {
 155                 if (proto->init_cgroup) {
 156                         ret = proto->init_cgroup(memcg, ss);
 157                         if (ret)
 158                                 goto out;
 159                 }
 160         }
 161
 162         mutex_unlock(&proto_list_mutex);
 163         return ret;
 164 out:
 165         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 166                 if (proto->destroy_cgroup)
 167                         proto->destroy_cgroup(memcg);
 168         mutex_unlock(&proto_list_mutex);
 169         return ret;
 170 }
 171
 172 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 173 {
 174         struct proto *proto;
 175
 176         mutex_lock(&proto_list_mutex);
 177         list_for_each_entry_reverse(proto, &proto_list, node)
 178                 if (proto->destroy_cgroup)
 179                         proto->destroy_cgroup(memcg);
 180         mutex_unlock(&proto_list_mutex);
 181 }
 182 #endif
 183
 184 /*
 185  * Each address family might have different locking rules, so we have
 186  * one slock key per address family:
 187  */
 188 static struct lock_class_key af_family_keys[AF_MAX];
 189 static struct lock_class_key af_family_slock_keys[AF_MAX];
 190
 191 #if defined(CONFIG_MEMCG_KMEM)
 192 struct static_key memcg_socket_limit_enabled;
 193 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 194 #endif
 195
 196 /*
 197  * Make lock validator output more readable. (we pre-construct these
 198  * strings build-time, so that runtime initialization of socket
 199  * locks is fast):
 200  */
 201 static const char *const af_family_key_strings[AF_MAX+1] = {
 202   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 203   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 204   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 205   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 206   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 207   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 208   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 209   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 210   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 211   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 212   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 213   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 214   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 215   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 216 };
 217 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 218   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 219   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 220   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 221   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 222   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 223   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 224   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 225   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 226   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 227   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 228   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 229   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 230   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 231   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 232 };
 233 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 234   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 235   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 236   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 237   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 238   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 239   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 240   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 241   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 242   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 243   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 244   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 245   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 246   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 247   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 248 };
 249
 250 /*
 251  * sk_callback_lock locking rules are per-address-family,
 252  * so split the lock classes by using a per-AF key:
 253  */
 254 static struct lock_class_key af_callback_keys[AF_MAX];
 255
 256 /* Take into consideration the size of the struct sk_buff overhead in the
 257  * determination of these values, since that is non-constant across
 258  * platforms.  This makes socket queueing behavior and performance
 259  * not depend upon such differences.
 260  */
 261 #define _SK_MEM_PACKETS         256
 262 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 263 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 264 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 265
 266 /* Run time adjustable parameters. */
 267 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 268 EXPORT_SYMBOL(sysctl_wmem_max);
 269 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 270 EXPORT_SYMBOL(sysctl_rmem_max);
 271 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 272 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 273
 274 /* Maximal space eaten by iovec or ancillary data plus some space */
 275 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 276 EXPORT_SYMBOL(sysctl_optmem_max);
 277
 278 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 279 EXPORT_SYMBOL_GPL(memalloc_socks);
 280
 281 /**
 282  * sk_set_memalloc - sets %SOCK_MEMALLOC
 283  * @sk: socket to set it on
 284  *
 285  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 286  * It's the responsibility of the admin to adjust min_free_kbytes
 287  * to meet the requirements
 288  */
 289 void sk_set_memalloc(struct sock *sk)
 290 {
 291         sock_set_flag(sk, SOCK_MEMALLOC);
 292         sk->sk_allocation |= __GFP_MEMALLOC;
 293         static_key_slow_inc(&memalloc_socks);
 294 }
 295 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 296
 297 void sk_clear_memalloc(struct sock *sk)
 298 {
 299         sock_reset_flag(sk, SOCK_MEMALLOC);
 300         sk->sk_allocation &= ~__GFP_MEMALLOC;
 301         static_key_slow_dec(&memalloc_socks);
 302
 303         /*
 304          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 305          * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 306          * it has rmem allocations there is a risk that the user of the
 307          * socket cannot make forward progress due to exceeding the rmem
 308          * limits. By rights, sk_clear_memalloc() should only be called
 309          * on sockets being torn down but warn and reset the accounting if
 310          * that assumption breaks.
 311          */
 312         if (WARN_ON(sk->sk_forward_alloc))
 313                 sk_mem_reclaim(sk);
 314 }
 315 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 316
 317 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318 {
 319         int ret;
 320         unsigned long pflags = current->flags;
 321
 322         /* these should have been dropped before queueing */
 323         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325         current->flags |= PF_MEMALLOC;
 326         ret = sk->sk_backlog_rcv(sk, skb);
 327         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 328
 329         return ret;
 330 }
 331 EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 334 {
 335         struct timeval tv;
 336
 337         if (optlen < sizeof(tv))
 338                 return -EINVAL;
 339         if (copy_from_user(&tv, optval, sizeof(tv)))
 340                 return -EFAULT;
 341         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 342                 return -EDOM;
 343
 344         if (tv.tv_sec < 0) {
 345                 static int warned __read_mostly;
 346
 347                 *timeo_p = 0;
 348                 if (warned < 10 && net_ratelimit()) {
 349                         warned++;
 350                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 351                                 __func__, current->comm, task_pid_nr(current));
 352                 }
 353                 return 0;
 354         }
 355         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 356         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 357                 return 0;
 358         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 359                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 360         return 0;
 361 }
 362
 363 static void sock_warn_obsolete_bsdism(const char *name)
 364 {
 365         static int warned;
 366         static char warncomm[TASK_COMM_LEN];
 367         if (strcmp(warncomm, current->comm) && warned < 5) {
 368                 strcpy(warncomm,  current->comm);
 369                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 370                         warncomm, name);
 371                 warned++;
 372         }
 373 }
 374
 375 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 376
 377 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 378 {
 379         if (sk->sk_flags & flags) {
 380                 sk->sk_flags &= ~flags;
 381                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 382                         net_disable_timestamp();
 383         }
 384 }
 385
 386
 387 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 388 {
 389         int err;
 390         int skb_len;
 391         unsigned long flags;
 392         struct sk_buff_head *list = &sk->sk_receive_queue;
 393
 394         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 395                 atomic_inc(&sk->sk_drops);
 396                 trace_sock_rcvqueue_full(sk, skb);
 397                 return -ENOMEM;
 398         }
 399
 400         err = sk_filter(sk, skb);
 401         if (err)
 402                 return err;
 403
 404         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 405                 atomic_inc(&sk->sk_drops);
 406                 return -ENOBUFS;
 407         }
 408
 409         skb->dev = NULL;
 410         skb_set_owner_r(skb, sk);
 411
 412         /* Cache the SKB length before we tack it onto the receive
 413          * queue.  Once it is added it no longer belongs to us and
 414          * may be freed by other threads of control pulling packets
 415          * from the queue.
 416          */
 417         skb_len = skb->len;
 418
 419         /* we escape from rcu protected region, make sure we dont leak
 420          * a norefcounted dst
 421          */
 422         skb_dst_force(skb);
 423
 424         spin_lock_irqsave(&list->lock, flags);
 425         skb->dropcount = atomic_read(&sk->sk_drops);
 426         __skb_queue_tail(list, skb);
 427         spin_unlock_irqrestore(&list->lock, flags);
 428
 429         if (!sock_flag(sk, SOCK_DEAD))
 430                 sk->sk_data_ready(sk, skb_len);
 431         return 0;
 432 }
 433 EXPORT_SYMBOL(sock_queue_rcv_skb);
 434
 435 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 436 {
 437         int rc = NET_RX_SUCCESS;
 438
 439         if (sk_filter(sk, skb))
 440                 goto discard_and_relse;
 441
 442         skb->dev = NULL;
 443
 444         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 445                 atomic_inc(&sk->sk_drops);
 446                 goto discard_and_relse;
 447         }
 448         if (nested)
 449                 bh_lock_sock_nested(sk);
 450         else
 451                 bh_lock_sock(sk);
 452         if (!sock_owned_by_user(sk)) {
 453                 /*
 454                  * trylock + unlock semantics:
 455                  */
 456                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 457
 458                 rc = sk_backlog_rcv(sk, skb);
 459
 460                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 461         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 462                 bh_unlock_sock(sk);
 463                 atomic_inc(&sk->sk_drops);
 464                 goto discard_and_relse;
 465         }
 466
 467         bh_unlock_sock(sk);
 468 out:
 469         sock_put(sk);
 470         return rc;
 471 discard_and_relse:
 472         kfree_skb(skb);
 473         goto out;
 474 }
 475 EXPORT_SYMBOL(sk_receive_skb);
 476
 477 void sk_reset_txq(struct sock *sk)
 478 {
 479         sk_tx_queue_clear(sk);
 480 }
 481 EXPORT_SYMBOL(sk_reset_txq);
 482
 483 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 484 {
 485         struct dst_entry *dst = __sk_dst_get(sk);
 486
 487         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 488                 sk_tx_queue_clear(sk);
 489                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 490                 dst_release(dst);
 491                 return NULL;
 492         }
 493
 494         return dst;
 495 }
 496 EXPORT_SYMBOL(__sk_dst_check);
 497
 498 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 499 {
 500         struct dst_entry *dst = sk_dst_get(sk);
 501
 502         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 503                 sk_dst_reset(sk);
 504                 dst_release(dst);
 505                 return NULL;
 506         }
 507
 508         return dst;
 509 }
 510 EXPORT_SYMBOL(sk_dst_check);
 511
 512 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 513                                 int optlen)
 514 {
 515         int ret = -ENOPROTOOPT;
 516 #ifdef CONFIG_NETDEVICES
 517         struct net *net = sock_net(sk);
 518         char devname[IFNAMSIZ];
 519         int index;
 520
 521         /* Sorry... */
 522         ret = -EPERM;
 523         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 524                 goto out;
 525
 526         ret = -EINVAL;
 527         if (optlen < 0)
 528                 goto out;
 529
 530         /* Bind this socket to a particular device like "eth0",
 531          * as specified in the passed interface name. If the
 532          * name is "" or the option length is zero the socket
 533          * is not bound.
 534          */
 535         if (optlen > IFNAMSIZ - 1)
 536                 optlen = IFNAMSIZ - 1;
 537         memset(devname, 0, sizeof(devname));
 538
 539         ret = -EFAULT;
 540         if (copy_from_user(devname, optval, optlen))
 541                 goto out;
 542
 543         index = 0;
 544         if (devname[0] != '\0') {
 545                 struct net_device *dev;
 546
 547                 rcu_read_lock();
 548                 dev = dev_get_by_name_rcu(net, devname);
 549                 if (dev)
 550                         index = dev->ifindex;
 551                 rcu_read_unlock();
 552                 ret = -ENODEV;
 553                 if (!dev)
 554                         goto out;
 555         }
 556
 557         lock_sock(sk);
 558         sk->sk_bound_dev_if = index;
 559         sk_dst_reset(sk);
 560         release_sock(sk);
 561
 562         ret = 0;
 563
 564 out:
 565 #endif
 566
 567         return ret;
 568 }
 569
 570 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 571                                 int __user *optlen, int len)
 572 {
 573         int ret = -ENOPROTOOPT;
 574 #ifdef CONFIG_NETDEVICES
 575         struct net *net = sock_net(sk);
 576         struct net_device *dev;
 577         char devname[IFNAMSIZ];
 578         unsigned seq;
 579
 580         if (sk->sk_bound_dev_if == 0) {
 581                 len = 0;
 582                 goto zero;
 583         }
 584
 585         ret = -EINVAL;
 586         if (len < IFNAMSIZ)
 587                 goto out;
 588
 589 retry:
 590         seq = read_seqcount_begin(&devnet_rename_seq);
 591         rcu_read_lock();
 592         dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
 593         ret = -ENODEV;
 594         if (!dev) {
 595                 rcu_read_unlock();
 596                 goto out;
 597         }
 598
 599         strcpy(devname, dev->name);
 600         rcu_read_unlock();
 601         if (read_seqcount_retry(&devnet_rename_seq, seq))
 602                 goto retry;
 603
 604         len = strlen(devname) + 1;
 605
 606         ret = -EFAULT;
 607         if (copy_to_user(optval, devname, len))
 608                 goto out;
 609
 610 zero:
 611         ret = -EFAULT;
 612         if (put_user(len, optlen))
 613                 goto out;
 614
 615         ret = 0;
 616
 617 out:
 618 #endif
 619
 620         return ret;
 621 }
 622
 623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 624 {
 625         if (valbool)
 626                 sock_set_flag(sk, bit);
 627         else
 628                 sock_reset_flag(sk, bit);
 629 }
 630
 631 /*
 632  *      This is meant for all protocols to use and covers goings on
 633  *      at the socket level. Everything here is generic.
 634  */
 635
 636 int sock_setsockopt(struct socket *sock, int level, int optname,
 637                     char __user *optval, unsigned int optlen)
 638 {
 639         struct sock *sk = sock->sk;
 640         int val;
 641         int valbool;
 642         struct linger ling;
 643         int ret = 0;
 644
 645         /*
 646          *      Options without arguments
 647          */
 648
 649         if (optname == SO_BINDTODEVICE)
 650                 return sock_setbindtodevice(sk, optval, optlen);
 651
 652         if (optlen < sizeof(int))
 653                 return -EINVAL;
 654
 655         if (get_user(val, (int __user *)optval))
 656                 return -EFAULT;
 657
 658         valbool = val ? 1 : 0;
 659
 660         lock_sock(sk);
 661
 662         switch (optname) {
 663         case SO_DEBUG:
 664                 if (val && !capable(CAP_NET_ADMIN))
 665                         ret = -EACCES;
 666                 else
 667                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 668                 break;
 669         case SO_REUSEADDR:
 670                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 671                 break;
 672         case SO_REUSEPORT:
 673                 sk->sk_reuseport = valbool;
 674                 break;
 675         case SO_TYPE:
 676         case SO_PROTOCOL:
 677         case SO_DOMAIN:
 678         case SO_ERROR:
 679                 ret = -ENOPROTOOPT;
 680                 break;
 681         case SO_DONTROUTE:
 682                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 683                 break;
 684         case SO_BROADCAST:
 685                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 686                 break;
 687         case SO_SNDBUF:
 688                 /* Don't error on this BSD doesn't and if you think
 689                  * about it this is right. Otherwise apps have to
 690                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 691                  * are treated in BSD as hints
 692                  */
 693                 val = min_t(u32, val, sysctl_wmem_max);
 694 set_sndbuf:
 695                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 696                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 697                 /* Wake up sending tasks if we upped the value. */
 698                 sk->sk_write_space(sk);
 699                 break;
 700
 701         case SO_SNDBUFFORCE:
 702                 if (!capable(CAP_NET_ADMIN)) {
 703                         ret = -EPERM;
 704                         break;
 705                 }
 706                 goto set_sndbuf;
 707
 708         case SO_RCVBUF:
 709                 /* Don't error on this BSD doesn't and if you think
 710                  * about it this is right. Otherwise apps have to
 711                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 712                  * are treated in BSD as hints
 713                  */
 714                 val = min_t(u32, val, sysctl_rmem_max);
 715 set_rcvbuf:
 716                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 717                 /*
 718                  * We double it on the way in to account for
 719                  * "struct sk_buff" etc. overhead.   Applications
 720                  * assume that the SO_RCVBUF setting they make will
 721                  * allow that much actual data to be received on that
 722                  * socket.
 723                  *
 724                  * Applications are unaware that "struct sk_buff" and
 725                  * other overheads allocate from the receive buffer
 726                  * during socket buffer allocation.
 727                  *
 728                  * And after considering the possible alternatives,
 729                  * returning the value we actually used in getsockopt
 730                  * is the most desirable behavior.
 731                  */
 732                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 733                 break;
 734
 735         case SO_RCVBUFFORCE:
 736                 if (!capable(CAP_NET_ADMIN)) {
 737                         ret = -EPERM;
 738                         break;
 739                 }
 740                 goto set_rcvbuf;
 741
 742         case SO_KEEPALIVE:
 743 #ifdef CONFIG_INET
 744                 if (sk->sk_protocol == IPPROTO_TCP &&
 745                     sk->sk_type == SOCK_STREAM)
 746                         tcp_set_keepalive(sk, valbool);
 747 #endif
 748                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 749                 break;
 750
 751         case SO_OOBINLINE:
 752                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 753                 break;
 754
 755         case SO_NO_CHECK:
 756                 sk->sk_no_check = valbool;
 757                 break;
 758
 759         case SO_PRIORITY:
 760                 if ((val >= 0 && val <= 6) ||
 761                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 762                         sk->sk_priority = val;
 763                 else
 764                         ret = -EPERM;
 765                 break;
 766
 767         case SO_LINGER:
 768                 if (optlen < sizeof(ling)) {
 769                         ret = -EINVAL;  /* 1003.1g */
 770                         break;
 771                 }
 772                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 773                         ret = -EFAULT;
 774                         break;
 775                 }
 776                 if (!ling.l_onoff)
 777                         sock_reset_flag(sk, SOCK_LINGER);
 778                 else {
 779 #if (BITS_PER_LONG == 32)
 780                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 781                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 782                         else
 783 #endif
 784                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 785                         sock_set_flag(sk, SOCK_LINGER);
 786                 }
 787                 break;
 788
 789         case SO_BSDCOMPAT:
 790                 sock_warn_obsolete_bsdism("setsockopt");
 791                 break;
 792
 793         case SO_PASSCRED:
 794                 if (valbool)
 795                         set_bit(SOCK_PASSCRED, &sock->flags);
 796                 else
 797                         clear_bit(SOCK_PASSCRED, &sock->flags);
 798                 break;
 799
 800         case SO_TIMESTAMP:
 801         case SO_TIMESTAMPNS:
 802                 if (valbool)  {
 803                         if (optname == SO_TIMESTAMP)
 804                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 805                         else
 806                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 807                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 808                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 809                 } else {
 810                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 811                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 812                 }
 813                 break;
 814
 815         case SO_TIMESTAMPING:
 816                 if (val & ~SOF_TIMESTAMPING_MASK) {
 817                         ret = -EINVAL;
 818                         break;
 819                 }
 820                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 821                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 822                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 823                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 824                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 825                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 826                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 827                         sock_enable_timestamp(sk,
 828                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 829                 else
 830                         sock_disable_timestamp(sk,
 831                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 832                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 833                                   val & SOF_TIMESTAMPING_SOFTWARE);
 834                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 835                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 836                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 837                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 838                 break;
 839
 840         case SO_RCVLOWAT:
 841                 if (val < 0)
 842                         val = INT_MAX;
 843                 sk->sk_rcvlowat = val ? : 1;
 844                 break;
 845
 846         case SO_RCVTIMEO:
 847                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 848                 break;
 849
 850         case SO_SNDTIMEO:
 851                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 852                 break;
 853
 854         case SO_ATTACH_FILTER:
 855                 ret = -EINVAL;
 856                 if (optlen == sizeof(struct sock_fprog)) {
 857                         struct sock_fprog fprog;
 858
 859                         ret = -EFAULT;
 860                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 861                                 break;
 862
 863                         ret = sk_attach_filter(&fprog, sk);
 864                 }
 865                 break;
 866
 867         case SO_DETACH_FILTER:
 868                 ret = sk_detach_filter(sk);
 869                 break;
 870
 871         case SO_LOCK_FILTER:
 872                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 873                         ret = -EPERM;
 874                 else
 875                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 876                 break;
 877
 878         case SO_PASSSEC:
 879                 if (valbool)
 880                         set_bit(SOCK_PASSSEC, &sock->flags);
 881                 else
 882                         clear_bit(SOCK_PASSSEC, &sock->flags);
 883                 break;
 884         case SO_MARK:
 885                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 886                         ret = -EPERM;
 887                 else
 888                         sk->sk_mark = val;
 889                 break;
 890
 891                 /* We implement the SO_SNDLOWAT etc to
 892                    not be settable (1003.1g 5.3) */
 893         case SO_RXQ_OVFL:
 894                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 895                 break;
 896
 897         case SO_WIFI_STATUS:
 898                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 899                 break;
 900
 901         case SO_PEEK_OFF:
 902                 if (sock->ops->set_peek_off)
 903                         sock->ops->set_peek_off(sk, val);
 904                 else
 905                         ret = -EOPNOTSUPP;
 906                 break;
 907
 908         case SO_NOFCS:
 909                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 910                 break;
 911
 912         case SO_SELECT_ERR_QUEUE:
 913                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 914                 break;
 915
 916 #ifdef CONFIG_NET_LL_RX_POLL
 917         case SO_LL:
 918                 /* allow unprivileged users to decrease the value */
 919                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 920                         ret = -EPERM;
 921                 else {
 922                         if (val < 0)
 923                                 ret = -EINVAL;
 924                         else
 925                                 sk->sk_ll_usec = val;
 926                 }
 927                 break;
 928 #endif
 929         default:
 930                 ret = -ENOPROTOOPT;
 931                 break;
 932         }
 933         release_sock(sk);
 934         return ret;
 935 }
 936 EXPORT_SYMBOL(sock_setsockopt);
 937
 938
 939 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 940                    struct ucred *ucred)
 941 {
 942         ucred->pid = pid_vnr(pid);
 943         ucred->uid = ucred->gid = -1;
 944         if (cred) {
 945                 struct user_namespace *current_ns = current_user_ns();
 946
 947                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
 948                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
 949         }
 950 }
 951 EXPORT_SYMBOL_GPL(cred_to_ucred);
 952
 953 int sock_getsockopt(struct socket *sock, int level, int optname,
 954                     char __user *optval, int __user *optlen)
 955 {
 956         struct sock *sk = sock->sk;
 957
 958         union {
 959                 int val;
 960                 struct linger ling;
 961                 struct timeval tm;
 962         } v;
 963
 964         int lv = sizeof(int);
 965         int len;
 966
 967         if (get_user(len, optlen))
 968                 return -EFAULT;
 969         if (len < 0)
 970                 return -EINVAL;
 971
 972         memset(&v, 0, sizeof(v));
 973
 974         switch (optname) {
 975         case SO_DEBUG:
 976                 v.val = sock_flag(sk, SOCK_DBG);
 977                 break;
 978
 979         case SO_DONTROUTE:
 980                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 981                 break;
 982
 983         case SO_BROADCAST:
 984                 v.val = sock_flag(sk, SOCK_BROADCAST);
 985                 break;
 986
 987         case SO_SNDBUF:
 988                 v.val = sk->sk_sndbuf;
 989                 break;
 990
 991         case SO_RCVBUF:
 992                 v.val = sk->sk_rcvbuf;
 993                 break;
 994
 995         case SO_REUSEADDR:
 996                 v.val = sk->sk_reuse;
 997                 break;
 998
 999         case SO_REUSEPORT:
1000                 v.val = sk->sk_reuseport;
1001                 break;
1002
1003         case SO_KEEPALIVE:
1004                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1005                 break;
1006
1007         case SO_TYPE:
1008                 v.val = sk->sk_type;
1009                 break;
1010
1011         case SO_PROTOCOL:
1012                 v.val = sk->sk_protocol;
1013                 break;
1014
1015         case SO_DOMAIN:
1016                 v.val = sk->sk_family;
1017                 break;
1018
1019         case SO_ERROR:
1020                 v.val = -sock_error(sk);
1021                 if (v.val == 0)
1022                         v.val = xchg(&sk->sk_err_soft, 0);
1023                 break;
1024
1025         case SO_OOBINLINE:
1026                 v.val = sock_flag(sk, SOCK_URGINLINE);
1027                 break;
1028
1029         case SO_NO_CHECK:
1030                 v.val = sk->sk_no_check;
1031                 break;
1032
1033         case SO_PRIORITY:
1034                 v.val = sk->sk_priority;
1035                 break;
1036
1037         case SO_LINGER:
1038                 lv              = sizeof(v.ling);
1039                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1040                 v.ling.l_linger = sk->sk_lingertime / HZ;
1041                 break;
1042
1043         case SO_BSDCOMPAT:
1044                 sock_warn_obsolete_bsdism("getsockopt");
1045                 break;
1046
1047         case SO_TIMESTAMP:
1048                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1049                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1050                 break;
1051
1052         case SO_TIMESTAMPNS:
1053                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1054                 break;
1055
1056         case SO_TIMESTAMPING:
1057                 v.val = 0;
1058                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1059                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1060                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1061                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1062                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1063                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1064                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1065                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1066                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1067                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
1068                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1069                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1070                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1071                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1072                 break;
1073
1074         case SO_RCVTIMEO:
1075                 lv = sizeof(struct timeval);
1076                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1077                         v.tm.tv_sec = 0;
1078                         v.tm.tv_usec = 0;
1079                 } else {
1080                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1081                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1082                 }
1083                 break;
1084
1085         case SO_SNDTIMEO:
1086                 lv = sizeof(struct timeval);
1087                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1088                         v.tm.tv_sec = 0;
1089                         v.tm.tv_usec = 0;
1090                 } else {
1091                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1092                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1093                 }
1094                 break;
1095
1096         case SO_RCVLOWAT:
1097                 v.val = sk->sk_rcvlowat;
1098                 break;
1099
1100         case SO_SNDLOWAT:
1101                 v.val = 1;
1102                 break;
1103
1104         case SO_PASSCRED:
1105                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1106                 break;
1107
1108         case SO_PEERCRED:
1109         {
1110                 struct ucred peercred;
1111                 if (len > sizeof(peercred))
1112                         len = sizeof(peercred);
1113                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1114                 if (copy_to_user(optval, &peercred, len))
1115                         return -EFAULT;
1116                 goto lenout;
1117         }
1118
1119         case SO_PEERNAME:
1120         {
1121                 char address[128];
1122
1123                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1124                         return -ENOTCONN;
1125                 if (lv < len)
1126                         return -EINVAL;
1127                 if (copy_to_user(optval, address, len))
1128                         return -EFAULT;
1129                 goto lenout;
1130         }
1131
1132         /* Dubious BSD thing... Probably nobody even uses it, but
1133          * the UNIX standard wants it for whatever reason... -DaveM
1134          */
1135         case SO_ACCEPTCONN:
1136                 v.val = sk->sk_state == TCP_LISTEN;
1137                 break;
1138
1139         case SO_PASSSEC:
1140                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1141                 break;
1142
1143         case SO_PEERSEC:
1144                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1145
1146         case SO_MARK:
1147                 v.val = sk->sk_mark;
1148                 break;
1149
1150         case SO_RXQ_OVFL:
1151                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1152                 break;
1153
1154         case SO_WIFI_STATUS:
1155                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1156                 break;
1157
1158         case SO_PEEK_OFF:
1159                 if (!sock->ops->set_peek_off)
1160                         return -EOPNOTSUPP;
1161
1162                 v.val = sk->sk_peek_off;
1163                 break;
1164         case SO_NOFCS:
1165                 v.val = sock_flag(sk, SOCK_NOFCS);
1166                 break;
1167
1168         case SO_BINDTODEVICE:
1169                 return sock_getbindtodevice(sk, optval, optlen, len);
1170
1171         case SO_GET_FILTER:
1172                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1173                 if (len < 0)
1174                         return len;
1175
1176                 goto lenout;
1177
1178         case SO_LOCK_FILTER:
1179                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1180                 break;
1181
1182         case SO_SELECT_ERR_QUEUE:
1183                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1184                 break;
1185
1186 #ifdef CONFIG_NET_LL_RX_POLL
1187         case SO_LL:
1188                 v.val = sk->sk_ll_usec;
1189                 break;
1190 #endif
1191
1192         default:
1193                 return -ENOPROTOOPT;
1194         }
1195
1196         if (len > lv)
1197                 len = lv;
1198         if (copy_to_user(optval, &v, len))
1199                 return -EFAULT;
1200 lenout:
1201         if (put_user(len, optlen))
1202                 return -EFAULT;
1203         return 0;
1204 }
1205
1206 /*
1207  * Initialize an sk_lock.
1208  *
1209  * (We also register the sk_lock with the lock validator.)
1210  */
1211 static inline void sock_lock_init(struct sock *sk)
1212 {
1213         sock_lock_init_class_and_name(sk,
1214                         af_family_slock_key_strings[sk->sk_family],
1215                         af_family_slock_keys + sk->sk_family,
1216                         af_family_key_strings[sk->sk_family],
1217                         af_family_keys + sk->sk_family);
1218 }
1219
1220 /*
1221  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1222  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1223  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1224  */
1225 static void sock_copy(struct sock *nsk, const struct sock *osk)
1226 {
1227 #ifdef CONFIG_SECURITY_NETWORK
1228         void *sptr = nsk->sk_security;
1229 #endif
1230         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1231
1232         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1233                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1234
1235 #ifdef CONFIG_SECURITY_NETWORK
1236         nsk->sk_security = sptr;
1237         security_sk_clone(osk, nsk);
1238 #endif
1239 }
1240
1241 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1242 {
1243         unsigned long nulls1, nulls2;
1244
1245         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1246         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1247         if (nulls1 > nulls2)
1248                 swap(nulls1, nulls2);
1249
1250         if (nulls1 != 0)
1251                 memset((char *)sk, 0, nulls1);
1252         memset((char *)sk + nulls1 + sizeof(void *), 0,
1253                nulls2 - nulls1 - sizeof(void *));
1254         memset((char *)sk + nulls2 + sizeof(void *), 0,
1255                size - nulls2 - sizeof(void *));
1256 }
1257 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1258
1259 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1260                 int family)
1261 {
1262         struct sock *sk;
1263         struct kmem_cache *slab;
1264
1265         slab = prot->slab;
1266         if (slab != NULL) {
1267                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1268                 if (!sk)
1269                         return sk;
1270                 if (priority & __GFP_ZERO) {
1271                         if (prot->clear_sk)
1272                                 prot->clear_sk(sk, prot->obj_size);
1273                         else
1274                                 sk_prot_clear_nulls(sk, prot->obj_size);
1275                 }
1276         } else
1277                 sk = kmalloc(prot->obj_size, priority);
1278
1279         if (sk != NULL) {
1280                 kmemcheck_annotate_bitfield(sk, flags);
1281
1282                 if (security_sk_alloc(sk, family, priority))
1283                         goto out_free;
1284
1285                 if (!try_module_get(prot->owner))
1286                         goto out_free_sec;
1287                 sk_tx_queue_clear(sk);
1288         }
1289
1290         return sk;
1291
1292 out_free_sec:
1293         security_sk_free(sk);
1294 out_free:
1295         if (slab != NULL)
1296                 kmem_cache_free(slab, sk);
1297         else
1298                 kfree(sk);
1299         return NULL;
1300 }
1301
1302 static void sk_prot_free(struct proto *prot, struct sock *sk)
1303 {
1304         struct kmem_cache *slab;
1305         struct module *owner;
1306
1307         owner = prot->owner;
1308         slab = prot->slab;
1309
1310         security_sk_free(sk);
1311         if (slab != NULL)
1312                 kmem_cache_free(slab, sk);
1313         else
1314                 kfree(sk);
1315         module_put(owner);
1316 }
1317
1318 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1319 void sock_update_classid(struct sock *sk)
1320 {
1321         u32 classid;
1322
1323         classid = task_cls_classid(current);
1324         if (classid != sk->sk_classid)
1325                 sk->sk_classid = classid;
1326 }
1327 EXPORT_SYMBOL(sock_update_classid);
1328 #endif
1329
1330 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1331 void sock_update_netprioidx(struct sock *sk)
1332 {
1333         if (in_interrupt())
1334                 return;
1335
1336         sk->sk_cgrp_prioidx = task_netprioidx(current);
1337 }
1338 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1339 #endif
1340
1341 /**
1342  *      sk_alloc - All socket objects are allocated here
1343  *      @net: the applicable net namespace
1344  *      @family: protocol family
1345  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1346  *      @prot: struct proto associated with this new sock instance
1347  */
1348 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1349                       struct proto *prot)
1350 {
1351         struct sock *sk;
1352
1353         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1354         if (sk) {
1355                 sk->sk_family = family;
1356                 /*
1357                  * See comment in struct sock definition to understand
1358                  * why we need sk_prot_creator -acme
1359                  */
1360                 sk->sk_prot = sk->sk_prot_creator = prot;
1361                 sock_lock_init(sk);
1362                 sock_net_set(sk, get_net(net));
1363                 atomic_set(&sk->sk_wmem_alloc, 1);
1364
1365                 sock_update_classid(sk);
1366                 sock_update_netprioidx(sk);
1367         }
1368
1369         return sk;
1370 }
1371 EXPORT_SYMBOL(sk_alloc);
1372
1373 static void __sk_free(struct sock *sk)
1374 {
1375         struct sk_filter *filter;
1376
1377         if (sk->sk_destruct)
1378                 sk->sk_destruct(sk);
1379
1380         filter = rcu_dereference_check(sk->sk_filter,
1381                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1382         if (filter) {
1383                 sk_filter_uncharge(sk, filter);
1384                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1385         }
1386
1387         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1388
1389         if (atomic_read(&sk->sk_omem_alloc))
1390                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1391                          __func__, atomic_read(&sk->sk_omem_alloc));
1392
1393         if (sk->sk_peer_cred)
1394                 put_cred(sk->sk_peer_cred);
1395         put_pid(sk->sk_peer_pid);
1396         put_net(sock_net(sk));
1397         sk_prot_free(sk->sk_prot_creator, sk);
1398 }
1399
1400 void sk_free(struct sock *sk)
1401 {
1402         /*
1403          * We subtract one from sk_wmem_alloc and can know if
1404          * some packets are still in some tx queue.
1405          * If not null, sock_wfree() will call __sk_free(sk) later
1406          */
1407         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1408                 __sk_free(sk);
1409 }
1410 EXPORT_SYMBOL(sk_free);
1411
1412 /*
1413  * Last sock_put should drop reference to sk->sk_net. It has already
1414  * been dropped in sk_change_net. Taking reference to stopping namespace
1415  * is not an option.
1416  * Take reference to a socket to remove it from hash _alive_ and after that
1417  * destroy it in the context of init_net.
1418  */
1419 void sk_release_kernel(struct sock *sk)
1420 {
1421         if (sk == NULL || sk->sk_socket == NULL)
1422                 return;
1423
1424         sock_hold(sk);
1425         sock_release(sk->sk_socket);
1426         release_net(sock_net(sk));
1427         sock_net_set(sk, get_net(&init_net));
1428         sock_put(sk);
1429 }
1430 EXPORT_SYMBOL(sk_release_kernel);
1431
1432 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1433 {
1434         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1435                 sock_update_memcg(newsk);
1436 }
1437
1438 /**
1439  *      sk_clone_lock - clone a socket, and lock its clone
1440  *      @sk: the socket to clone
1441  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1442  *
1443  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1444  */
1445 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1446 {
1447         struct sock *newsk;
1448
1449         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1450         if (newsk != NULL) {
1451                 struct sk_filter *filter;
1452
1453                 sock_copy(newsk, sk);
1454
1455                 /* SANITY */
1456                 get_net(sock_net(newsk));
1457                 sk_node_init(&newsk->sk_node);
1458                 sock_lock_init(newsk);
1459                 bh_lock_sock(newsk);
1460                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1461                 newsk->sk_backlog.len = 0;
1462
1463                 atomic_set(&newsk->sk_rmem_alloc, 0);
1464                 /*
1465                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1466                  */
1467                 atomic_set(&newsk->sk_wmem_alloc, 1);
1468                 atomic_set(&newsk->sk_omem_alloc, 0);
1469                 skb_queue_head_init(&newsk->sk_receive_queue);
1470                 skb_queue_head_init(&newsk->sk_write_queue);
1471 #ifdef CONFIG_NET_DMA
1472                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1473 #endif
1474
1475                 spin_lock_init(&newsk->sk_dst_lock);
1476                 rwlock_init(&newsk->sk_callback_lock);
1477                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1478                                 af_callback_keys + newsk->sk_family,
1479                                 af_family_clock_key_strings[newsk->sk_family]);
1480
1481                 newsk->sk_dst_cache     = NULL;
1482                 newsk->sk_wmem_queued   = 0;
1483                 newsk->sk_forward_alloc = 0;
1484                 newsk->sk_send_head     = NULL;
1485                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1486
1487                 sock_reset_flag(newsk, SOCK_DONE);
1488                 skb_queue_head_init(&newsk->sk_error_queue);
1489
1490                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1491                 if (filter != NULL)
1492                         sk_filter_charge(newsk, filter);
1493
1494                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1495                         /* It is still raw copy of parent, so invalidate
1496                          * destructor and make plain sk_free() */
1497                         newsk->sk_destruct = NULL;
1498                         bh_unlock_sock(newsk);
1499                         sk_free(newsk);
1500                         newsk = NULL;
1501                         goto out;
1502                 }
1503
1504                 newsk->sk_err      = 0;
1505                 newsk->sk_priority = 0;
1506                 /*
1507                  * Before updating sk_refcnt, we must commit prior changes to memory
1508                  * (Documentation/RCU/rculist_nulls.txt for details)
1509                  */
1510                 smp_wmb();
1511                 atomic_set(&newsk->sk_refcnt, 2);
1512
1513                 /*
1514                  * Increment the counter in the same struct proto as the master
1515                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1516                  * is the same as sk->sk_prot->socks, as this field was copied
1517                  * with memcpy).
1518                  *
1519                  * This _changes_ the previous behaviour, where
1520                  * tcp_create_openreq_child always was incrementing the
1521                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1522                  * to be taken into account in all callers. -acme
1523                  */
1524                 sk_refcnt_debug_inc(newsk);
1525                 sk_set_socket(newsk, NULL);
1526                 newsk->sk_wq = NULL;
1527
1528                 sk_update_clone(sk, newsk);
1529
1530                 if (newsk->sk_prot->sockets_allocated)
1531                         sk_sockets_allocated_inc(newsk);
1532
1533                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1534                         net_enable_timestamp();
1535         }
1536 out:
1537         return newsk;
1538 }
1539 EXPORT_SYMBOL_GPL(sk_clone_lock);
1540
1541 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1542 {
1543         __sk_dst_set(sk, dst);
1544         sk->sk_route_caps = dst->dev->features;
1545         if (sk->sk_route_caps & NETIF_F_GSO)
1546                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1547         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1548         if (sk_can_gso(sk)) {
1549                 if (dst->header_len) {
1550                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1551                 } else {
1552                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1553                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1554                         sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1555                 }
1556         }
1557 }
1558 EXPORT_SYMBOL_GPL(sk_setup_caps);
1559
1560 /*
1561  *      Simple resource managers for sockets.
1562  */
1563
1564
1565 /*
1566  * Write buffer destructor automatically called from kfree_skb.
1567  */
1568 void sock_wfree(struct sk_buff *skb)
1569 {
1570         struct sock *sk = skb->sk;
1571         unsigned int len = skb->truesize;
1572
1573         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1574                 /*
1575                  * Keep a reference on sk_wmem_alloc, this will be released
1576                  * after sk_write_space() call
1577                  */
1578                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1579                 sk->sk_write_space(sk);
1580                 len = 1;
1581         }
1582         /*
1583          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1584          * could not do because of in-flight packets
1585          */
1586         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1587                 __sk_free(sk);
1588 }
1589 EXPORT_SYMBOL(sock_wfree);
1590
1591 /*
1592  * Read buffer destructor automatically called from kfree_skb.
1593  */
1594 void sock_rfree(struct sk_buff *skb)
1595 {
1596         struct sock *sk = skb->sk;
1597         unsigned int len = skb->truesize;
1598
1599         atomic_sub(len, &sk->sk_rmem_alloc);
1600         sk_mem_uncharge(sk, len);
1601 }
1602 EXPORT_SYMBOL(sock_rfree);
1603
1604 void sock_edemux(struct sk_buff *skb)
1605 {
1606         struct sock *sk = skb->sk;
1607
1608 #ifdef CONFIG_INET
1609         if (sk->sk_state == TCP_TIME_WAIT)
1610                 inet_twsk_put(inet_twsk(sk));
1611         else
1612 #endif
1613                 sock_put(sk);
1614 }
1615 EXPORT_SYMBOL(sock_edemux);
1616
1617 kuid_t sock_i_uid(struct sock *sk)
1618 {
1619         kuid_t uid;
1620
1621         read_lock_bh(&sk->sk_callback_lock);
1622         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1623         read_unlock_bh(&sk->sk_callback_lock);
1624         return uid;
1625 }
1626 EXPORT_SYMBOL(sock_i_uid);
1627
1628 unsigned long sock_i_ino(struct sock *sk)
1629 {
1630         unsigned long ino;
1631
1632         read_lock_bh(&sk->sk_callback_lock);
1633         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1634         read_unlock_bh(&sk->sk_callback_lock);
1635         return ino;
1636 }
1637 EXPORT_SYMBOL(sock_i_ino);
1638
1639 /*
1640  * Allocate a skb from the socket's send buffer.
1641  */
1642 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1643                              gfp_t priority)
1644 {
1645         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1646                 struct sk_buff *skb = alloc_skb(size, priority);
1647                 if (skb) {
1648                         skb_set_owner_w(skb, sk);
1649                         return skb;
1650                 }
1651         }
1652         return NULL;
1653 }
1654 EXPORT_SYMBOL(sock_wmalloc);
1655
1656 /*
1657  * Allocate a skb from the socket's receive buffer.
1658  */
1659 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1660                              gfp_t priority)
1661 {
1662         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1663                 struct sk_buff *skb = alloc_skb(size, priority);
1664                 if (skb) {
1665                         skb_set_owner_r(skb, sk);
1666                         return skb;
1667                 }
1668         }
1669         return NULL;
1670 }
1671
1672 /*
1673  * Allocate a memory block from the socket's option memory buffer.
1674  */
1675 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1676 {
1677         if ((unsigned int)size <= sysctl_optmem_max &&
1678             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1679                 void *mem;
1680                 /* First do the add, to avoid the race if kmalloc
1681                  * might sleep.
1682                  */
1683                 atomic_add(size, &sk->sk_omem_alloc);
1684                 mem = kmalloc(size, priority);
1685                 if (mem)
1686                         return mem;
1687                 atomic_sub(size, &sk->sk_omem_alloc);
1688         }
1689         return NULL;
1690 }
1691 EXPORT_SYMBOL(sock_kmalloc);
1692
1693 /*
1694  * Free an option memory block.
1695  */
1696 void sock_kfree_s(struct sock *sk, void *mem, int size)
1697 {
1698         kfree(mem);
1699         atomic_sub(size, &sk->sk_omem_alloc);
1700 }
1701 EXPORT_SYMBOL(sock_kfree_s);
1702
1703 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1704    I think, these locks should be removed for datagram sockets.
1705  */
1706 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1707 {
1708         DEFINE_WAIT(wait);
1709
1710         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1711         for (;;) {
1712                 if (!timeo)
1713                         break;
1714                 if (signal_pending(current))
1715                         break;
1716                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1717                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1718                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1719                         break;
1720                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1721                         break;
1722                 if (sk->sk_err)
1723                         break;
1724                 timeo = schedule_timeout(timeo);
1725         }
1726         finish_wait(sk_sleep(sk), &wait);
1727         return timeo;
1728 }
1729
1730
1731 /*
1732  *      Generic send/receive buffer handlers
1733  */
1734
1735 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1736                                      unsigned long data_len, int noblock,
1737                                      int *errcode)
1738 {
1739         struct sk_buff *skb;
1740         gfp_t gfp_mask;
1741         long timeo;
1742         int err;
1743         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1744
1745         err = -EMSGSIZE;
1746         if (npages > MAX_SKB_FRAGS)
1747                 goto failure;
1748
1749         gfp_mask = sk->sk_allocation;
1750         if (gfp_mask & __GFP_WAIT)
1751                 gfp_mask |= __GFP_REPEAT;
1752
1753         timeo = sock_sndtimeo(sk, noblock);
1754         while (1) {
1755                 err = sock_error(sk);
1756                 if (err != 0)
1757                         goto failure;
1758
1759                 err = -EPIPE;
1760                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1761                         goto failure;
1762
1763                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1764                         skb = alloc_skb(header_len, gfp_mask);
1765                         if (skb) {
1766                                 int i;
1767
1768                                 /* No pages, we're done... */
1769                                 if (!data_len)
1770                                         break;
1771
1772                                 skb->truesize += data_len;
1773                                 skb_shinfo(skb)->nr_frags = npages;
1774                                 for (i = 0; i < npages; i++) {
1775                                         struct page *page;
1776
1777                                         page = alloc_pages(sk->sk_allocation, 0);
1778                                         if (!page) {
1779                                                 err = -ENOBUFS;
1780                                                 skb_shinfo(skb)->nr_frags = i;
1781                                                 kfree_skb(skb);
1782                                                 goto failure;
1783                                         }
1784
1785                                         __skb_fill_page_desc(skb, i,
1786                                                         page, 0,
1787                                                         (data_len >= PAGE_SIZE ?
1788                                                          PAGE_SIZE :
1789                                                          data_len));
1790                                         data_len -= PAGE_SIZE;
1791                                 }
1792
1793                                 /* Full success... */
1794                                 break;
1795                         }
1796                         err = -ENOBUFS;
1797                         goto failure;
1798                 }
1799                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1800                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1801                 err = -EAGAIN;
1802                 if (!timeo)
1803                         goto failure;
1804                 if (signal_pending(current))
1805                         goto interrupted;
1806                 timeo = sock_wait_for_wmem(sk, timeo);
1807         }
1808
1809         skb_set_owner_w(skb, sk);
1810         return skb;
1811
1812 interrupted:
1813         err = sock_intr_errno(timeo);
1814 failure:
1815         *errcode = err;
1816         return NULL;
1817 }
1818 EXPORT_SYMBOL(sock_alloc_send_pskb);
1819
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a purely linear
 * buffer: @size becomes the header length and no paged data is requested.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode);
}
1825 EXPORT_SYMBOL(sock_alloc_send_skb);
1826
1827 /* On 32bit arches, an skb frag is limited to 2^15 */
1828 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1829
1830 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1831 {
1832         int order;
1833
1834         if (pfrag->page) {
1835                 if (atomic_read(&pfrag->page->_count) == 1) {
1836                         pfrag->offset = 0;
1837                         return true;
1838                 }
1839                 if (pfrag->offset < pfrag->size)
1840                         return true;
1841                 put_page(pfrag->page);
1842         }
1843
1844         /* We restrict high order allocations to users that can afford to wait */
1845         order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1846
1847         do {
1848                 gfp_t gfp = sk->sk_allocation;
1849
1850                 if (order)
1851                         gfp |= __GFP_COMP | __GFP_NOWARN;
1852                 pfrag->page = alloc_pages(gfp, order);
1853                 if (likely(pfrag->page)) {
1854                         pfrag->offset = 0;
1855                         pfrag->size = PAGE_SIZE << order;
1856                         return true;
1857                 }
1858         } while (--order >= 0);
1859
1860         sk_enter_memory_pressure(sk);
1861         sk_stream_moderate_sndbuf(sk);
1862         return false;
1863 }
1864 EXPORT_SYMBOL(sk_page_frag_refill);
1865
1866 static void __lock_sock(struct sock *sk)
1867         __releases(&sk->sk_lock.slock)
1868         __acquires(&sk->sk_lock.slock)
1869 {
1870         DEFINE_WAIT(wait);
1871
1872         for (;;) {
1873                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1874                                         TASK_UNINTERRUPTIBLE);
1875                 spin_unlock_bh(&sk->sk_lock.slock);
1876                 schedule();
1877                 spin_lock_bh(&sk->sk_lock.slock);
1878                 if (!sock_owned_by_user(sk))
1879                         break;
1880         }
1881         finish_wait(&sk->sk_lock.wq, &wait);
1882 }
1883
1884 static void __release_sock(struct sock *sk)
1885         __releases(&sk->sk_lock.slock)
1886         __acquires(&sk->sk_lock.slock)
1887 {
1888         struct sk_buff *skb = sk->sk_backlog.head;
1889
1890         do {
1891                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1892                 bh_unlock_sock(sk);
1893
1894                 do {
1895                         struct sk_buff *next = skb->next;
1896
1897                         prefetch(next);
1898                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1899                         skb->next = NULL;
1900                         sk_backlog_rcv(sk, skb);
1901
1902                         /*
1903                          * We are in process context here with softirqs
1904                          * disabled, use cond_resched_softirq() to preempt.
1905                          * This is safe to do because we've taken the backlog
1906                          * queue private:
1907                          */
1908                         cond_resched_softirq();
1909
1910                         skb = next;
1911                 } while (skb != NULL);
1912
1913                 bh_lock_sock(sk);
1914         } while ((skb = sk->sk_backlog.head) != NULL);
1915
1916         /*
1917          * Doing the zeroing here guarantee we can not loop forever
1918          * while a wild producer attempts to flood us.
1919          */
1920         sk->sk_backlog.len = 0;
1921 }
1922
1923 /**
1924  * sk_wait_data - wait for data to arrive at sk_receive_queue
1925  * @sk:    sock to wait on
1926  * @timeo: for how long
1927  *
1928  * Now socket state including sk->sk_err is changed only under lock,
1929  * hence we may omit checks after joining wait queue.
1930  * We check receive queue before schedule() only as optimization;
1931  * it is very likely that release_sock() added new data.
1932  */
1933 int sk_wait_data(struct sock *sk, long *timeo)
1934 {
1935         int rc;
1936         DEFINE_WAIT(wait);
1937
1938         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1939         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1940         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1941         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1942         finish_wait(sk_sleep(sk), &wait);
1943         return rc;
1944 }
1945 EXPORT_SYMBOL(sk_wait_data);
1946
1947 /**
1948  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1949  *      @sk: socket
1950  *      @size: memory size to allocate
1951  *      @kind: allocation type
1952  *
1953  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1954  *      rmem allocation. This function assumes that protocols which have
1955  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1956  */
1957 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1958 {
1959         struct proto *prot = sk->sk_prot;
1960         int amt = sk_mem_pages(size);
1961         long allocated;
1962         int parent_status = UNDER_LIMIT;
1963
1964         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1965
1966         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1967
1968         /* Under limit. */
1969         if (parent_status == UNDER_LIMIT &&
1970                         allocated <= sk_prot_mem_limits(sk, 0)) {
1971                 sk_leave_memory_pressure(sk);
1972                 return 1;
1973         }
1974
1975         /* Under pressure. (we or our parents) */
1976         if ((parent_status > SOFT_LIMIT) ||
1977                         allocated > sk_prot_mem_limits(sk, 1))
1978                 sk_enter_memory_pressure(sk);
1979
1980         /* Over hard limit (we or our parents) */
1981         if ((parent_status == OVER_LIMIT) ||
1982                         (allocated > sk_prot_mem_limits(sk, 2)))
1983                 goto suppress_allocation;
1984
1985         /* guarantee minimum buffer size under pressure */
1986         if (kind == SK_MEM_RECV) {
1987                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1988                         return 1;
1989
1990         } else { /* SK_MEM_SEND */
1991                 if (sk->sk_type == SOCK_STREAM) {
1992                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1993                                 return 1;
1994                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1995                            prot->sysctl_wmem[0])
1996                                 return 1;
1997         }
1998
1999         if (sk_has_memory_pressure(sk)) {
2000                 int alloc;
2001
2002                 if (!sk_under_memory_pressure(sk))
2003                         return 1;
2004                 alloc = sk_sockets_allocated_read_positive(sk);
2005                 if (sk_prot_mem_limits(sk, 2) > alloc *
2006                     sk_mem_pages(sk->sk_wmem_queued +
2007                                  atomic_read(&sk->sk_rmem_alloc) +
2008                                  sk->sk_forward_alloc))
2009                         return 1;
2010         }
2011
2012 suppress_allocation:
2013
2014         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2015                 sk_stream_moderate_sndbuf(sk);
2016
2017                 /* Fail only if socket is _under_ its sndbuf.
2018                  * In this case we cannot block, so that we have to fail.
2019                  */
2020                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2021                         return 1;
2022         }
2023
2024         trace_sock_exceed_buf_limit(sk, prot, allocated);
2025
2026         /* Alas. Undo changes. */
2027         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2028
2029         sk_memory_allocated_sub(sk, amt);
2030
2031         return 0;
2032 }
2033 EXPORT_SYMBOL(__sk_mem_schedule);
2034
2035 /**
2036  *      __sk_reclaim - reclaim memory_allocated
2037  *      @sk: socket
2038  */
2039 void __sk_mem_reclaim(struct sock *sk)
2040 {
2041         sk_memory_allocated_sub(sk,
2042                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2043         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2044
2045         if (sk_under_memory_pressure(sk) &&
2046             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2047                 sk_leave_memory_pressure(sk);
2048 }
2049 EXPORT_SYMBOL(__sk_mem_reclaim);
2050
2051
2052 /*
2053  * Set of default routines for initialising struct proto_ops when
2054  * the protocol does not support a particular function. In certain
2055  * cases where it makes no sense for a protocol to have a "do nothing"
2056  * function, some default processing is provided.
2057  */
2058
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock,
		 struct sockaddr *saddr,
		 int len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2063 EXPORT_SYMBOL(sock_no_bind);
2064
/* connect() stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock,
		    struct sockaddr *saddr,
		    int len,
		    int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2070 EXPORT_SYMBOL(sock_no_connect);
2071
/* socketpair() stub for protocols that cannot pair: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1,
		       struct socket *sock2)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2076 EXPORT_SYMBOL(sock_no_socketpair);
2077
/* accept() stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock,
		   struct socket *newsock,
		   int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2082 EXPORT_SYMBOL(sock_no_accept);
2083
/* getname() stub for protocols without address lookup: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock,
		    struct sockaddr *saddr,
		    int *len,
		    int peer)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2089 EXPORT_SYMBOL(sock_no_getname);
2090
2091 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2092 {
2093         return 0;
2094 }
2095 EXPORT_SYMBOL(sock_no_poll);
2096
/* ioctl() stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock,
		  unsigned int cmd,
		  unsigned long arg)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2101 EXPORT_SYMBOL(sock_no_ioctl);
2102
/* listen() stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock,
		   int backlog)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2107 EXPORT_SYMBOL(sock_no_listen);
2108
/* shutdown() stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock,
		     int how)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2113 EXPORT_SYMBOL(sock_no_shutdown);
2114
2115 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2116                     char __user *optval, unsigned int optlen)
2117 {
2118         return -EOPNOTSUPP;
2119 }
2120 EXPORT_SYMBOL(sock_no_setsockopt);
2121
2122 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2123                     char __user *optval, int __user *optlen)
2124 {
2125         return -EOPNOTSUPP;
2126 }
2127 EXPORT_SYMBOL(sock_no_getsockopt);
2128
/* sendmsg() stub for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb,
		    struct socket *sock,
		    struct msghdr *m,
		    size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2134 EXPORT_SYMBOL(sock_no_sendmsg);
2135
/* recvmsg() stub for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb,
		    struct socket *sock,
		    struct msghdr *m,
		    size_t len,
		    int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
2141 EXPORT_SYMBOL(sock_no_recvmsg);
2142
/*
 * mmap() stub.  Returns -ENODEV rather than -EOPNOTSUPP so that the result
 * mirrors the error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file,
		 struct socket *sock,
		 struct vm_area_struct *vma)
{
	const int no_mmap_method = -ENODEV;

	return no_mmap_method;
}
2148 EXPORT_SYMBOL(sock_no_mmap);
2149
2150 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2151 {
2152         ssize_t res;
2153         struct msghdr msg = {.msg_flags = flags};
2154         struct kvec iov;
2155         char *kaddr = kmap(page);
2156         iov.iov_base = kaddr + offset;
2157         iov.iov_len = size;
2158         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2159         kunmap(page);
2160         return res;
2161 }
2162 EXPORT_SYMBOL(sock_no_sendpage);
2163
2164 /*
2165  *      Default Socket Callbacks
2166  */
2167
2168 static void sock_def_wakeup(struct sock *sk)
2169 {
2170         struct socket_wq *wq;
2171
2172         rcu_read_lock();
2173         wq = rcu_dereference(sk->sk_wq);
2174         if (wq_has_sleeper(wq))
2175                 wake_up_interruptible_all(&wq->wait);
2176         rcu_read_unlock();
2177 }
2178
2179 static void sock_def_error_report(struct sock *sk)
2180 {
2181         struct socket_wq *wq;
2182
2183         rcu_read_lock();
2184         wq = rcu_dereference(sk->sk_wq);
2185         if (wq_has_sleeper(wq))
2186                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2187         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2188         rcu_read_unlock();
2189 }
2190
2191 static void sock_def_readable(struct sock *sk, int len)
2192 {
2193         struct socket_wq *wq;
2194
2195         rcu_read_lock();
2196         wq = rcu_dereference(sk->sk_wq);
2197         if (wq_has_sleeper(wq))
2198                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2199                                                 POLLRDNORM | POLLRDBAND);
2200         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2201         rcu_read_unlock();
2202 }
2203
2204 static void sock_def_write_space(struct sock *sk)
2205 {
2206         struct socket_wq *wq;
2207
2208         rcu_read_lock();
2209
2210         /* Do not wake up a writer until he can make "significant"
2211          * progress.  --DaveM
2212          */
2213         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2214                 wq = rcu_dereference(sk->sk_wq);
2215                 if (wq_has_sleeper(wq))
2216                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2217                                                 POLLWRNORM | POLLWRBAND);
2218
2219                 /* Should agree with poll, otherwise some programs break */
2220                 if (sock_writeable(sk))
2221                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2222         }
2223
2224         rcu_read_unlock();
2225 }
2226
2227 static void sock_def_destruct(struct sock *sk)
2228 {
2229         kfree(sk->sk_protinfo);
2230 }
2231
2232 void sk_send_sigurg(struct sock *sk)
2233 {
2234         if (sk->sk_socket && sk->sk_socket->file)
2235                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2236                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2237 }
2238 EXPORT_SYMBOL(sk_send_sigurg);
2239
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken when
 * mod_timer() returns zero (presumably meaning the timer was not already
 * pending, so no reference was held for it yet).
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
2246 EXPORT_SYMBOL(sk_reset_timer);
2247
/*
 * Cancel @timer.  When del_timer() returns non-zero, the socket reference
 * counterpart of sk_reset_timer() is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
2253 EXPORT_SYMBOL(sk_stop_timer);
2254
2255 void sock_init_data(struct socket *sock, struct sock *sk)
2256 {
2257         skb_queue_head_init(&sk->sk_receive_queue);
2258         skb_queue_head_init(&sk->sk_write_queue);
2259         skb_queue_head_init(&sk->sk_error_queue);
2260 #ifdef CONFIG_NET_DMA
2261         skb_queue_head_init(&sk->sk_async_wait_queue);
2262 #endif
2263
2264         sk->sk_send_head        =       NULL;
2265
2266         init_timer(&sk->sk_timer);
2267
2268         sk->sk_allocation       =       GFP_KERNEL;
2269         sk->sk_rcvbuf           =       sysctl_rmem_default;
2270         sk->sk_sndbuf           =       sysctl_wmem_default;
2271         sk->sk_state            =       TCP_CLOSE;
2272         sk_set_socket(sk, sock);
2273
2274         sock_set_flag(sk, SOCK_ZAPPED);
2275
2276         if (sock) {
2277                 sk->sk_type     =       sock->type;
2278                 sk->sk_wq       =       sock->wq;
2279                 sock->sk        =       sk;
2280         } else
2281                 sk->sk_wq       =       NULL;
2282
2283         spin_lock_init(&sk->sk_dst_lock);
2284         rwlock_init(&sk->sk_callback_lock);
2285         lockdep_set_class_and_name(&sk->sk_callback_lock,
2286                         af_callback_keys + sk->sk_family,
2287                         af_family_clock_key_strings[sk->sk_family]);
2288
2289         sk->sk_state_change     =       sock_def_wakeup;
2290         sk->sk_data_ready       =       sock_def_readable;
2291         sk->sk_write_space      =       sock_def_write_space;
2292         sk->sk_error_report     =       sock_def_error_report;
2293         sk->sk_destruct         =       sock_def_destruct;
2294
2295         sk->sk_frag.page        =       NULL;
2296         sk->sk_frag.offset      =       0;
2297         sk->sk_peek_off         =       -1;
2298
2299         sk->sk_peer_pid         =       NULL;
2300         sk->sk_peer_cred        =       NULL;
2301         sk->sk_write_pending    =       0;
2302         sk->sk_rcvlowat         =       1;
2303         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2304         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2305
2306         sk->sk_stamp = ktime_set(-1L, 0);
2307
2308 #ifdef CONFIG_NET_LL_RX_POLL
2309         sk->sk_napi_id          =       0;
2310         sk->sk_ll_usec          =       sysctl_net_ll_read;
2311 #endif
2312
2313         /*
2314          * Before updating sk_refcnt, we must commit prior changes to memory
2315          * (Documentation/RCU/rculist_nulls.txt for details)
2316          */
2317         smp_wmb();
2318         atomic_set(&sk->sk_refcnt, 1);
2319         atomic_set(&sk->sk_drops, 0);
2320 }
2321 EXPORT_SYMBOL(sock_init_data);
2322
2323 void lock_sock_nested(struct sock *sk, int subclass)
2324 {
2325         might_sleep();
2326         spin_lock_bh(&sk->sk_lock.slock);
2327         if (sk->sk_lock.owned)
2328                 __lock_sock(sk);
2329         sk->sk_lock.owned = 1;
2330         spin_unlock(&sk->sk_lock.slock);
2331         /*
2332          * The sk_lock has mutex_lock() semantics here:
2333          */
2334         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2335         local_bh_enable();
2336 }
2337 EXPORT_SYMBOL(lock_sock_nested);
2338
2339 void release_sock(struct sock *sk)
2340 {
2341         /*
2342          * The sk_lock has mutex_unlock() semantics:
2343          */
2344         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2345
2346         spin_lock_bh(&sk->sk_lock.slock);
2347         if (sk->sk_backlog.tail)
2348                 __release_sock(sk);
2349
2350         if (sk->sk_prot->release_cb)
2351                 sk->sk_prot->release_cb(sk);
2352
2353         sk->sk_lock.owned = 0;
2354         if (waitqueue_active(&sk->sk_lock.wq))
2355                 wake_up(&sk->sk_lock.wq);
2356         spin_unlock_bh(&sk->sk_lock.slock);
2357 }
2358 EXPORT_SYMBOL(release_sock);
2359
2360 /**
2361  * lock_sock_fast - fast version of lock_sock
2362  * @sk: socket
2363  *
2364  * This version should be used for very small section, where process wont block
2365  * return false if fast path is taken
2366  *   sk_lock.slock locked, owned = 0, BH disabled
2367  * return true if slow path is taken
2368  *   sk_lock.slock unlocked, owned = 1, BH enabled
2369  */
2370 bool lock_sock_fast(struct sock *sk)
2371 {
2372         might_sleep();
2373         spin_lock_bh(&sk->sk_lock.slock);
2374
2375         if (!sk->sk_lock.owned)
2376                 /*
2377                  * Note : We must disable BH
2378                  */
2379                 return false;
2380
2381         __lock_sock(sk);
2382         sk->sk_lock.owned = 1;
2383         spin_unlock(&sk->sk_lock.slock);
2384         /*
2385          * The sk_lock has mutex_lock() semantics here:
2386          */
2387         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2388         local_bh_enable();
2389         return true;
2390 }
2391 EXPORT_SYMBOL(lock_sock_fast);
2392
2393 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2394 {
2395         struct timeval tv;
2396         if (!sock_flag(sk, SOCK_TIMESTAMP))
2397                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2398         tv = ktime_to_timeval(sk->sk_stamp);
2399         if (tv.tv_sec == -1)
2400                 return -ENOENT;
2401         if (tv.tv_sec == 0) {
2402                 sk->sk_stamp = ktime_get_real();
2403                 tv = ktime_to_timeval(sk->sk_stamp);
2404         }
2405         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2406 }
2407 EXPORT_SYMBOL(sock_get_timestamp);
2408
2409 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2410 {
2411         struct timespec ts;
2412         if (!sock_flag(sk, SOCK_TIMESTAMP))
2413                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2414         ts = ktime_to_timespec(sk->sk_stamp);
2415         if (ts.tv_sec == -1)
2416                 return -ENOENT;
2417         if (ts.tv_sec == 0) {
2418                 sk->sk_stamp = ktime_get_real();
2419                 ts = ktime_to_timespec(sk->sk_stamp);
2420         }
2421         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2422 }
2423 EXPORT_SYMBOL(sock_get_timestampns);
2424
2425 void sock_enable_timestamp(struct sock *sk, int flag)
2426 {
2427         if (!sock_flag(sk, flag)) {
2428                 unsigned long previous_flags = sk->sk_flags;
2429
2430                 sock_set_flag(sk, flag);
2431                 /*
2432                  * we just set one of the two flags which require net
2433                  * time stamping, but time stamping might have been on
2434                  * already because of the other one
2435                  */
2436                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2437                         net_enable_timestamp();
2438         }
2439 }
2440
2441 /*
2442  *      Get a socket option on an socket.
2443  *
2444  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2445  *      asynchronous errors should be reported by getsockopt. We assume
2446  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2447  */
2448 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2449                            char __user *optval, int __user *optlen)
2450 {
2451         struct sock *sk = sock->sk;
2452
2453         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2454 }
2455 EXPORT_SYMBOL(sock_common_getsockopt);
2456
2457 #ifdef CONFIG_COMPAT
2458 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2459                                   char __user *optval, int __user *optlen)
2460 {
2461         struct sock *sk = sock->sk;
2462
2463         if (sk->sk_prot->compat_getsockopt != NULL)
2464                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2465                                                       optval, optlen);
2466         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2467 }
2468 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2469 #endif
2470
2471 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2472                         struct msghdr *msg, size_t size, int flags)
2473 {
2474         struct sock *sk = sock->sk;
2475         int addr_len = 0;
2476         int err;
2477
2478         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2479                                    flags & ~MSG_DONTWAIT, &addr_len);
2480         if (err >= 0)
2481                 msg->msg_namelen = addr_len;
2482         return err;
2483 }
2484 EXPORT_SYMBOL(sock_common_recvmsg);
2485
2486 /*
2487  *      Set socket options on an inet socket.
2488  */
2489 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2490                            char __user *optval, unsigned int optlen)
2491 {
2492         struct sock *sk = sock->sk;
2493
2494         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2495 }
2496 EXPORT_SYMBOL(sock_common_setsockopt);
2497
2498 #ifdef CONFIG_COMPAT
2499 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2500                                   char __user *optval, unsigned int optlen)
2501 {
2502         struct sock *sk = sock->sk;
2503
2504         if (sk->sk_prot->compat_setsockopt != NULL)
2505                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2506                                                       optval, optlen);
2507         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2508 }
2509 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2510 #endif
2511
2512 void sk_common_release(struct sock *sk)
2513 {
2514         if (sk->sk_prot->destroy)
2515                 sk->sk_prot->destroy(sk);
2516
2517         /*
2518          * Observation: when sock_common_release is called, processes have
2519          * no access to socket. But net still has.
2520          * Step one, detach it from networking:
2521          *
2522          * A. Remove from hash tables.
2523          */
2524
2525         sk->sk_prot->unhash(sk);
2526
2527         /*
2528          * In this point socket cannot receive new packets, but it is possible
2529          * that some packets are in flight because some CPU runs receiver and
2530          * did hash table lookup before we unhashed socket. They will achieve
2531          * receive queue and will be purged by socket destructor.
2532          *
2533          * Also we still have packets pending on receive queue and probably,
2534          * our own packets waiting in device queues. sock_destroy will drain
2535          * receive queue, but transmitted packets will delay socket destruction
2536          * until the last reference will be released.
2537          */
2538
2539         sock_orphan(sk);
2540
2541         xfrm_sk_free_policy(sk);
2542
2543         sk_refcnt_debug_release(sk);
2544
2545         if (sk->sk_frag.page) {
2546                 put_page(sk->sk_frag.page);
2547                 sk->sk_frag.page = NULL;
2548         }
2549
2550         sock_put(sk);
2551 }
2552 EXPORT_SYMBOL(sk_common_release);
2553
2554 #ifdef CONFIG_PROC_FS
/*
 * Number of per-protocol "in use" counter slots. Also the size of the
 * proto_inuse_idx bitmap below.
 */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One counter per slot; indexed by proto->inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Slots handed out by assign_proto_idx() and returned by release_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2561
2562 #ifdef CONFIG_NET_NS
2563 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2564 {
2565         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2566 }
2567 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2568
2569 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2570 {
2571         int cpu, idx = prot->inuse_idx;
2572         int res = 0;
2573
2574         for_each_possible_cpu(cpu)
2575                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2576
2577         return res >= 0 ? res : 0;
2578 }
2579 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2580
2581 static int __net_init sock_inuse_init_net(struct net *net)
2582 {
2583         net->core.inuse = alloc_percpu(struct prot_inuse);
2584         return net->core.inuse ? 0 : -ENOMEM;
2585 }
2586
2587 static void __net_exit sock_inuse_exit_net(struct net *net)
2588 {
2589         free_percpu(net->core.inuse);
2590 }
2591
2592 static struct pernet_operations net_inuse_ops = {
2593         .init = sock_inuse_init_net,
2594         .exit = sock_inuse_exit_net,
2595 };
2596
2597 static __init int net_inuse_init(void)
2598 {
2599         if (register_pernet_subsys(&net_inuse_ops))
2600                 panic("Cannot initialize net inuse counters");
2601
2602         return 0;
2603 }
2604
2605 core_initcall(net_inuse_init);
2606 #else
2607 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2608
2609 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2610 {
2611         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2612 }
2613 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2614
2615 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2616 {
2617         int cpu, idx = prot->inuse_idx;
2618         int res = 0;
2619
2620         for_each_possible_cpu(cpu)
2621                 res += per_cpu(prot_inuse, cpu).val[idx];
2622
2623         return res >= 0 ? res : 0;
2624 }
2625 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2626 #endif
2627
2628 static void assign_proto_idx(struct proto *prot)
2629 {
2630         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2631
2632         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2633                 pr_err("PROTO_INUSE_NR exhausted\n");
2634                 return;
2635         }
2636
2637         set_bit(prot->inuse_idx, proto_inuse_idx);
2638 }
2639
2640 static void release_proto_idx(struct proto *prot)
2641 {
2642         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2643                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2644 }
2645 #else
/* Without CONFIG_PROC_FS there are no in-use counters: both helpers are no-ops. */
static inline void assign_proto_idx(struct proto *prot) { }

static inline void release_proto_idx(struct proto *prot) { }
2653 #endif
2654
2655 int proto_register(struct proto *prot, int alloc_slab)
2656 {
2657         if (alloc_slab) {
2658                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2659                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2660                                         NULL);
2661
2662                 if (prot->slab == NULL) {
2663                         pr_crit("%s: Can't create sock SLAB cache!\n",
2664                                 prot->name);
2665                         goto out;
2666                 }
2667
2668                 if (prot->rsk_prot != NULL) {
2669                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2670                         if (prot->rsk_prot->slab_name == NULL)
2671                                 goto out_free_sock_slab;
2672
2673                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2674                                                                  prot->rsk_prot->obj_size, 0,
2675                                                                  SLAB_HWCACHE_ALIGN, NULL);
2676
2677                         if (prot->rsk_prot->slab == NULL) {
2678                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2679                                         prot->name);
2680                                 goto out_free_request_sock_slab_name;
2681                         }
2682                 }
2683
2684                 if (prot->twsk_prot != NULL) {
2685                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2686
2687                         if (prot->twsk_prot->twsk_slab_name == NULL)
2688                                 goto out_free_request_sock_slab;
2689
2690                         prot->twsk_prot->twsk_slab =
2691                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2692                                                   prot->twsk_prot->twsk_obj_size,
2693                                                   0,
2694                                                   SLAB_HWCACHE_ALIGN |
2695                                                         prot->slab_flags,
2696                                                   NULL);
2697                         if (prot->twsk_prot->twsk_slab == NULL)
2698                                 goto out_free_timewait_sock_slab_name;
2699                 }
2700         }
2701
2702         mutex_lock(&proto_list_mutex);
2703         list_add(&prot->node, &proto_list);
2704         assign_proto_idx(prot);
2705         mutex_unlock(&proto_list_mutex);
2706         return 0;
2707
2708 out_free_timewait_sock_slab_name:
2709         kfree(prot->twsk_prot->twsk_slab_name);
2710 out_free_request_sock_slab:
2711         if (prot->rsk_prot && prot->rsk_prot->slab) {
2712                 kmem_cache_destroy(prot->rsk_prot->slab);
2713                 prot->rsk_prot->slab = NULL;
2714         }
2715 out_free_request_sock_slab_name:
2716         if (prot->rsk_prot)
2717                 kfree(prot->rsk_prot->slab_name);
2718 out_free_sock_slab:
2719         kmem_cache_destroy(prot->slab);
2720         prot->slab = NULL;
2721 out:
2722         return -ENOBUFS;
2723 }
2724 EXPORT_SYMBOL(proto_register);
2725
2726 void proto_unregister(struct proto *prot)
2727 {
2728         mutex_lock(&proto_list_mutex);
2729         release_proto_idx(prot);
2730         list_del(&prot->node);
2731         mutex_unlock(&proto_list_mutex);
2732
2733         if (prot->slab != NULL) {
2734                 kmem_cache_destroy(prot->slab);
2735                 prot->slab = NULL;
2736         }
2737
2738         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2739                 kmem_cache_destroy(prot->rsk_prot->slab);
2740                 kfree(prot->rsk_prot->slab_name);
2741                 prot->rsk_prot->slab = NULL;
2742         }
2743
2744         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2745                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2746                 kfree(prot->twsk_prot->twsk_slab_name);
2747                 prot->twsk_prot->twsk_slab = NULL;
2748         }
2749 }
2750 EXPORT_SYMBOL(proto_unregister);
2751
2752 #ifdef CONFIG_PROC_FS
2753 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2754         __acquires(proto_list_mutex)
2755 {
2756         mutex_lock(&proto_list_mutex);
2757         return seq_list_start_head(&proto_list, *pos);
2758 }
2759
2760 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2761 {
2762         return seq_list_next(v, &proto_list, pos);
2763 }
2764
2765 static void proto_seq_stop(struct seq_file *seq, void *v)
2766         __releases(proto_list_mutex)
2767 {
2768         mutex_unlock(&proto_list_mutex);
2769 }
2770
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the protocols table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2775 static long sock_prot_memory_allocated(struct proto *proto)
2776 {
2777         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2778 }
2779
2780 static char *sock_prot_memory_pressure(struct proto *proto)
2781 {
2782         return proto->memory_pressure != NULL ?
2783         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2784 }
2785
2786 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2787 {
2788
2789         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2790                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2791                    proto->name,
2792                    proto->obj_size,
2793                    sock_prot_inuse_get(seq_file_net(seq), proto),
2794                    sock_prot_memory_allocated(proto),
2795                    sock_prot_memory_pressure(proto),
2796                    proto->max_header,
2797                    proto->slab == NULL ? "no" : "yes",
2798                    module_name(proto->owner),
2799                    proto_method_implemented(proto->close),
2800                    proto_method_implemented(proto->connect),
2801                    proto_method_implemented(proto->disconnect),
2802                    proto_method_implemented(proto->accept),
2803                    proto_method_implemented(proto->ioctl),
2804                    proto_method_implemented(proto->init),
2805                    proto_method_implemented(proto->destroy),
2806                    proto_method_implemented(proto->shutdown),
2807                    proto_method_implemented(proto->setsockopt),
2808                    proto_method_implemented(proto->getsockopt),
2809                    proto_method_implemented(proto->sendmsg),
2810                    proto_method_implemented(proto->recvmsg),
2811                    proto_method_implemented(proto->sendpage),
2812                    proto_method_implemented(proto->bind),
2813                    proto_method_implemented(proto->backlog_rcv),
2814                    proto_method_implemented(proto->hash),
2815                    proto_method_implemented(proto->unhash),
2816                    proto_method_implemented(proto->get_port),
2817                    proto_method_implemented(proto->enter_memory_pressure));
2818 }
2819
2820 static int proto_seq_show(struct seq_file *seq, void *v)
2821 {
2822         if (v == &proto_list)
2823                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2824                            "protocol",
2825                            "size",
2826                            "sockets",
2827                            "memory",
2828                            "press",
2829                            "maxhdr",
2830                            "slab",
2831                            "module",
2832                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2833         else
2834                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2835         return 0;
2836 }
2837
2838 static const struct seq_operations proto_seq_ops = {
2839         .start  = proto_seq_start,
2840         .next   = proto_seq_next,
2841         .stop   = proto_seq_stop,
2842         .show   = proto_seq_show,
2843 };
2844
2845 static int proto_seq_open(struct inode *inode, struct file *file)
2846 {
2847         return seq_open_net(inode, file, &proto_seq_ops,
2848                             sizeof(struct seq_net_private));
2849 }
2850
2851 static const struct file_operations proto_seq_fops = {
2852         .owner          = THIS_MODULE,
2853         .open           = proto_seq_open,
2854         .read           = seq_read,
2855         .llseek         = seq_lseek,
2856         .release        = seq_release_net,
2857 };
2858
2859 static __net_init int proto_init_net(struct net *net)
2860 {
2861         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2862                 return -ENOMEM;
2863
2864         return 0;
2865 }
2866
2867 static __net_exit void proto_exit_net(struct net *net)
2868 {
2869         remove_proc_entry("protocols", net->proc_net);
2870 }
2871
2872
2873 static __net_initdata struct pernet_operations proto_net_ops = {
2874         .init = proto_init_net,
2875         .exit = proto_exit_net,
2876 };
2877
2878 static int __init proto_init(void)
2879 {
2880         return register_pernet_subsys(&proto_net_ops);
2881 }
2882
2883 subsys_initcall(proto_init);
2884
2885 #endif /* PROC_FS */